news2kindle 0.1.1

@@ -0,0 +1,236 @@
+ # scraping internet.watch.impress.co.jp for News2Kindle
+ #
+
+ require 'nokogiri'
+ require 'open-uri'
+ require 'uri'
+ require 'ostruct'
+ require 'tmpdir'
+ require 'pathname'
+ require 'fileutils'
+
+ module News2Kindle
+ module Generator
+ class InternetWatch
+ TOP = 'https://internet.watch.impress.co.jp'
+
+ def initialize( tmpdir )
+ @current_dir = tmpdir
+
+ @src_dir = @current_dir + '/src'
+ Dir::mkdir( @src_dir )
+
+ @dst_dir = @current_dir + '/dst'
+ Dir::mkdir( @dst_dir )
+ FileUtils.cp( "./resource/internet-watch.jpg", @dst_dir )
+ FileUtils.cp( "./resource/internet-watch.css", @dst_dir )
+ end
+
+ def generate(opts)
+ now = opts[:now]
+ items = []
+
+ rdf_file = "http://rss.rssad.jp/rss/internetwatch/internet.rdf"
+ rdf = retry_loop( 5 ) do
+ Nokogiri(open(rdf_file, 'r:utf-8', &:read))
+ end
+ (rdf / 'item' ).each do |item|
+ uri = URI( item.attr( 'rdf:about' ).to_s )
+ next unless /internet\.watch\.impress\.co\.jp/ =~ uri.host
+ uri.query = nil # remove query of 'ref=rss'
+ next if News2Kindle::DupChecker.dup?(uri)
+
+ title = (item / 'title').text
+ date = item.elements.map{|e| e.text if e.name == 'date'}.join
+ items << OpenStruct::new( :uri => uri, :title => title, :date => date )
+ end
+ items.sort!{|a,b| a.date <=> b.date}
+
+ now_str = now.strftime( '%Y-%m-%d %H:%M' )
+
+ #
+ # generating articles in html
+ #
+ items.each do |item|
+ begin
+ article = get_article( item.uri )
+ open( "#{@dst_dir}/#{item_id item.uri}.html", 'w' ) do |f|
+ f.puts html_header( item.title )
+ contents = (article / 'div.mainContents')
+ (contents / 'img').each do |img|
+ org = img.attr('ajax') || img.attr('src')
+ next if org =~ /^http/ # skip images on other servers
+ begin
+ img_file = retry_loop( 5 ) do
+ open( "#{TOP}#{org}", &:read )
+ end
+ cache = "#{org.gsub( /\//, '_' ).sub( /^_/, '' )}"
+ open( "#{@dst_dir}/#{cache}", 'w' ){|f| f.write img_file}
+ img.set_attribute( 'src', cache )
+ rescue OpenURI::HTTPError
+ News2Kindle.logger.error "skipped an image: #{TOP}#{org}"
+ end
+ end
+ f.puts contents.inner_html
+ f.puts html_footer
+ end
+ rescue
+ News2Kindle.logger.warn "#{$!.class}: #$!"
+ News2Kindle.logger.warn "skipped an article: #{item.uri}"
+ end
+ end
+
+ #
+ # generating TOC in html
+ #
+ open( "#{@dst_dir}/toc.html", 'w:utf-8' ) do |f|
+ f.write html_header( 'Table of Contents' )
+ if items.size == 0
+ f.puts %Q|<p>本日は記事がありません。</p>|
+ else
+ f.puts "<ul>"
+ items.each do |item|
+ f.puts %Q|\t<li><a href="#{item_id item.uri}.html">#{item.title}</a></li>|
+ end
+ f.puts "</ul>"
+ end
+ f.write html_footer
+ end
+
+ #
+ # generating TOC in ncx
+ #
+ open( "#{@dst_dir}/toc.ncx", 'w:utf-8' ) do |f|
+ f.write <<-XML.gsub( /^\t/, '' )
+ <?xml version="1.0" encoding="UTF-8"?>
+ <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
+ <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
+ <docTitle><text>INTERNET Watch (#{now_str})</text></docTitle>
+ <navMap>
+ <navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
+ XML
+
+ items.each_with_index do |item, index|
+ f.puts %Q|\t\t<navPoint id="#{item_id item.uri}" playOrder="#{index}"><navLabel><text>#{item.title}</text></navLabel><content src="#{item_id item.uri}.html" /></navPoint>|
+ end
+
+ f.write <<-XML.gsub( /^\t/, '' )
+ </navMap>
+ </ncx>
+ XML
+ end
+
+ #
+ # generating OPF
+ #
+ open( "#{@dst_dir}/internet-watch.opf", 'w:utf-8' ) do |f|
+ f.write <<-XML.gsub( /^\t/, '' )
+ <?xml version="1.0" encoding="utf-8"?>
+ <package unique-identifier="uid">
+ <metadata>
+ <dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
+ <dc:Title>INTERNET Watch (#{now_str})</dc:Title>
+ <dc:Language>ja-JP</dc:Language>
+ <dc:Creator>インプレス</dc:Creator>
+ <dc:Description>INTERNET Watch、#{now_str}生成</dc:Description>
+ <dc:Date>#{now.strftime( '%d/%m/%Y' )}</dc:Date>
+ </dc-metadata>
+ <x-metadata>
+ <output encoding="utf-8" content-type="text/x-oeb1-document"></output>
+ <EmbeddedCover>internet-watch.jpg</EmbeddedCover>
+ </x-metadata>
+ </metadata>
+ <manifest>
+ <item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
+ <item id="style" media-type="text/css" href="internet-watch.css"></item>
+ <item id="index" media-type="text/html" href="toc.html"></item>
+ XML
+
+ items.each do |item|
+ f.puts %Q|\t\t<item id="#{item_id item.uri}" media-type="text/html" href="#{item_id item.uri}.html"></item>|
+ end
+
+ f.write <<-XML.gsub( /^\t/, '' )
+ </manifest>
+ <spine toc="toc">
+ <itemref idref="index" />
+ XML
+
+ items.each do |item|
+ f.puts %Q|\t<itemref idref="#{item_id item.uri}" />\n|
+ end
+
+ f.write <<-XML.gsub( /^\t/, '' )
+ </spine>
+ <tours></tours>
+ <guide>
+ <reference type="toc" title="Table of Contents" href="toc.html"></reference>
+ <reference type="start" title="Table of Contents" href="toc.html"></reference>
+ </guide>
+ </package>
+ XML
+ end
+
+ yield "#{@dst_dir}/internet-watch.opf"
+ end
+
+ private
+
+ def retry_loop( times )
+ count = 0
+ begin
+ yield
+ rescue
+ count += 1
+ if count >= times
+ raise
+ else
+ News2Kindle.logger.error $!
+ News2Kindle.logger.info "#{count} retry."
+ sleep 1
+ retry
+ end
+ end
+ end
+
+ def item_id( uri )
+ File::basename( uri.path, '.html' )
+ end
+
+ def get_article( uri )
+ cache = "#{@src_dir}/#{File::basename uri.path}"
+ begin
+ html = open( cache, &:read )
+ rescue Errno::ENOENT
+ #puts "getting article: #{uri.path}".encode( Encoding::default_external )
+ html = retry_loop( 5 ) do
+ open( uri, &:read )
+ end
+ open( cache, 'w' ){|f| f.write html }
+ end
+ Nokogiri( html.encode 'UTF-8', invalid: :replace, undef: :replace, replace: '?' )
+ end
+
+ def html_header( title )
+ <<-HTML.gsub( /^\t/, '' )
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+ <html>
+ <head>
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
+ <title>#{title}</title>
+ <link rel="stylesheet" href="internet-watch.css" type="text/css" media="all"></link>
+ </head>
+ <body>
+ <h1>#{title}</h1>
+ HTML
+ end
+
+ def html_footer
+ <<-HTML.gsub( /^\t/, '' )
+ </body>
+ </html>
+ HTML
+ end
+ end
+ end
+ end
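For orientation, a minimal driver sketch (editorial, not part of this diff): it assumes the surrounding News2Kindle runner supplies News2Kindle::DupChecker, News2Kindle.logger, the ./resource files, and network access, and that a later stage turns the yielded OPF into a MOBI file.

require 'tmpdir'

Dir.mktmpdir do |tmpdir|
  generator = News2Kindle::Generator::InternetWatch.new(tmpdir)
  # :now is the timestamp used in the NCX/OPF titles above.
  generator.generate(now: Time.now) do |opf_path|
    # opf_path is ".../dst/internet-watch.opf"; handing it to a MOBI builder
    # is an assumption about the surrounding pipeline, not shown in this diff.
    puts "OPF ready: #{opf_path}"
  end
end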
@@ -0,0 +1,18 @@
+ # scraping nikkei.com (for free user) for News2Kindle
+ #
+
+ require 'nokogiri'
+ require 'open-uri'
+ require 'tmpdir'
+ require 'pathname'
+ require (File.dirname(__FILE__) + '/nikkei-paid')
+
+ module News2Kindle
+ module Generator
+ class NikkeiFree < NikkeiPaid
+ def auth
+ return nil, nil
+ end
+ end
+ end
+ end
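Editorial note: because auth returns nil for both credentials, the inherited NikkeiPaid#generate takes its unauthenticated branch (agent.get( TOP )) and skips the login and logout requests. The inherited basename helper derives the output name from this subclass, so this generator yields dst/nikkei-free.opf rather than dst/nikkei-paid.opf; a sketch of that derivation:

'News2Kindle::Generator::NikkeiFree'
  .sub(/.*:/, '')          # "NikkeiFree"
  .gsub(/([A-Z])/, '-\1')  # "-Nikkei-Free"
  .sub(/^-/, '')           # "Nikkei-Free"
  .downcase                # => "nikkei-free"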
@@ -0,0 +1,352 @@
+ # scraping nikkei.com (for paid user) for News2Kindle
+ #
+
+ require 'mechanize'
+ require 'nokogiri'
+ require 'open-uri'
+ require 'tmpdir'
+ require 'pathname'
+ require 'fileutils' # needed for the FileUtils.cp calls below
+
+ module News2Kindle
+ module Generator
+ class NikkeiPaid
+ class IllegalPage < StandardError; end
+
+ TOP = 'https://www.nikkei.com'
+ LOGIN = "#{TOP}/etc/accounts/login?dps=3&pageflag=top&url=http%3A%2F%2Fwww.nikkei.com%2F"
+
+ def initialize( tmpdir )
+ @nikkei_id, @nikkei_pw = auth
+ @current_dir = tmpdir
+
+ @src_dir = @current_dir + '/src'
+ Dir::mkdir( @src_dir )
+
+ @dst_dir = @current_dir + '/dst'
+ Dir::mkdir( @dst_dir )
+ FileUtils.cp( "./resource/nikkei.jpg", @dst_dir )
+ FileUtils.cp( "./resource/nikkei.css", @dst_dir )
+ end
+
+ def generate(opts)
+ @now = opts[:now]
+ @now_str = @now.strftime '%Y-%m-%d %H:%M'
+
+ agent = Mechanize::new
+ agent.set_proxy( *ENV['HTTP_PROXY'].split( /:/ ) ) if ENV['HTTP_PROXY']
+
+ toc = []
+ if @nikkei_id and @nikkei_pw
+ agent.get('https://regist.nikkei.com/ds/etc/accounts/logout')
+ agent.get( LOGIN )
+ agent.page.form_with( :name => 'autoPostForm' ).submit
+ agent.page.form_with( :name => 'LA7010Form01' ) do |form|
+ form['LA7010Form01:LA7010Email'] = @nikkei_id
+ form['LA7010Form01:LA7010Password'] = @nikkei_pw
+ form.click_button
+ end
+ agent.page.forms.first.submit
+ else
+ agent.get( TOP )
+ end
+
+ #
+ # scraping top news
+ #
+ toc_top = ['TOP NEWS']
+ %w(first second third fourth).each do |category|
+ (agent.page / "div.nx-top_news_#{category} h3 a").each do |a|
+ uri = a.attr('href')
+ next if News2Kindle::DupChecker.dup?(uri)
+ toc_top << [canonical( a.text.strip ), uri]
+ end
+ end
+ toc << toc_top
+
+ #
+ # scraping all categories
+ #
+ (agent.page / 'div.cmnc-genre').each do |genre|
+ toc_cat = []
+ (genre / 'h4.cmnc-genre_title a.cmnc-title_text').each do |cat|
+ next if /local/ =~ cat.attr( 'href' )
+ toc_cat << cat.text
+ (genre / 'li a').each do |article|
+ uri = article.attr('href')
+ next if News2Kindle::DupChecker.dup?(uri)
+ toc_cat << [canonical( article.text ), uri]
+ end
+ end
+ toc << toc_cat
+ end
+
+ begin
+ generate_contents( toc, agent )
+ yield "#{@dst_dir}/#{basename}.opf"
+ end
+
+ if @nikkei_id and @nikkei_pw
+ agent.get('https://regist.nikkei.com/ds/etc/accounts/logout')
+ end
+ end
+
+ private
+
+ def auth
+ require 'pit'
+ login = Pit::get('news2kindle', require: {
+ nikkei_user: 'your ID of Nikkei.',
+ nikkei_pass: 'your Password of Nikkei.',
+ })
+ return login[:nikkei_user], login[:nikkei_pass]
+ end
+
+ def basename
+ self.class.to_s.sub(/.*:/, '').gsub(/([A-Z])/, '-\\1').sub(/^-/, '').downcase
+ end
+
+ def canonical( str )
+ str.gsub( /\uFF5E/, "\u301C" ) # for WAVE DASH problem
+ end
+
+ def retry_loop( times )
+ count = 0
+ begin
+ yield
+ rescue
+ count += 1
+ if count >= times
+ raise
+ else
+ News2Kindle.logger.error $!
+ News2Kindle.logger.info "#{count} retry."
+ retry
+ end
+ end
+ end
+
+ def html_header( title )
+ <<~HTML
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+ <html>
+ <head>
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
+ <title>#{title}</title>
+ <link rel="stylesheet" href="nikkei.css" type="text/css" media="all"></link>
+ </head>
+ <body>
+ <h1>#{title}</h1>
+ HTML
+ end
+
+ def get_html_item( agent, uri, sub = nil )
+ uri.sub!( %r|^https://www.nikkei.com|, '' )
+ aid = uri2aid( uri )
+ html = nil
+ if File::exist?( "#{@src_dir}/#{aid}#{sub}.html" ) # loading cache
+ html = Nokogiri( open( "#{@src_dir}/#{aid}#{sub}.html", 'r:utf-8', &:read ) )
+ else
+ begin
+ #puts "getting html #{aid}#{sub}"
+ retry_loop( 5 ) do
+ agent.get( "#{TOP}#{uri}" )
+ html = agent.page.root
+ sleep 1
+ end
+ rescue
+ News2Kindle.logger.error "cannot get #{TOP}#{uri}."
+ raise
+ end
+ open( "#{@src_dir}/#{aid}#{sub}.html", 'w:utf-8' ) do |f|
+ f.write( html.to_html )
+ end
+ end
+ html
+ end
+
+ def scrape_html_item( html )
+ result = ''
+ (html / 'div.cmn-article_text').each do |div|
+ div.children.each do |e|
+ #div.css('div.cmn-photo_style2 img', 'p', 'table').each do |e|
+ case e.name
+ when 'p'
+ next unless (e / 'a.cmnc-continue').empty?
+ (e / 'span.JSID_urlData').remove
+ para = canonical e.text.strip.sub( /^ /, '' )
+ result << "\t<p>#{para}</p>" unless para.empty?
+ when 'table'
+ result << e.to_html
+ when 'div'
+ e.css('img').each do |img|
+ image_url = img['src']
+ next if /^http/ =~ image_url # skip images on other servers
+ next if /^\/\// =~ image_url # skip assets
+ image_file = File::basename( image_url )
+ begin
+ image = open( "#{TOP}#{image_url.sub /PN/, 'PB'}", &:read )
+ open( "#{@dst_dir}/#{image_file}", 'w' ){|fp| fp.write image}
+ result << %Q|\t<div>|
+ result << %Q|\t\t<img src="#{image_file}">|
+ result << %Q|\t\t<p>[#{e.text}]</p>| unless e.text.strip.empty?
+ result << %Q|\t</div>|
+ rescue
+ News2Kindle.logger.debug $!
+ News2Kindle.logger.warn "FAIL TO DOWNLOAD IMAGE: #{image_url}"
+ end
+ end
+ end
+ end
+ end
+ result
+ end
+
+ def html_item( item, uri, agent )
+ aid = uri2aid( uri )
+ return '' unless aid
+ html = get_html_item( agent, uri )
+ out_file = "#{@dst_dir}/#{aid}.html"
+
+ begin
+ open( out_file, 'w:utf-8' ) do |f|
+ f.puts canonical( html_header( (html / 'h1.cmn-article_title, h4.cmn-article_title, h2.cmn-article_title')[0].text.strip ) )
+ f.puts scrape_html_item( html )
+ (html / 'div.cmn-article_nation ul li a').map {|link|
+ link.attr( 'href' )
+ }.sort.uniq.each_with_index do |link,index|
+ f.puts scrape_html_item( get_html_item( agent, link, index + 2 ) )
+ end
+ f.puts html_footer
+ end
+
+ %Q|\t\t<li><a href="#{aid}.html">#{item}</a></li>|
+ rescue NoMethodError
+ News2Kindle.logger.debug $!
+ News2Kindle.logger.error "page parsing failed. #{aid}"
+ File.delete out_file
+ raise IllegalPage.new
+ end
+ end
+
+ def html_footer
+ <<~HTML
+ </body>
+ </html>
+ HTML
+ end
+
+ def ncx_header
+ <<~XML
+ <?xml version="1.0" encoding="UTF-8"?>
+ <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
+ <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
+ <docTitle><text>日経電子版 (#{@now_str})</text></docTitle>
+ <navMap>
+ <navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
+ XML
+ end
+
+ def ncx_item( item, uri, index )
+ aid = uri2aid( uri )
+ aid ? %Q|\t\t<navPoint id="#{aid}" playOrder="#{index}"><navLabel><text>#{item}</text></navLabel><content src="#{aid}.html" /></navPoint>| : ''
+ end
+
+ def ncx_footer
+ <<~XML
+ </navMap>
+ </ncx>
+ XML
+ end
+
+ def opf_header
+ <<~XML
+ <?xml version="1.0" encoding="utf-8"?>
+ <package unique-identifier="uid">
+ <metadata>
+ <dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
+ <dc:Title>日経電子版 (#{@now_str})</dc:Title>
+ <dc:Language>ja-JP</dc:Language>
+ <dc:Creator>日本経済新聞社</dc:Creator>
+ <dc:Description>日経電子版、#{@now_str}生成</dc:Description>
+ <dc:Date>#{@now.strftime( '%d/%m/%Y' )}</dc:Date>
+ </dc-metadata>
+ <x-metadata>
+ <output encoding="utf-8" content-type="text/x-oeb1-document"></output>
+ <EmbeddedCover>nikkei.jpg</EmbeddedCover>
+ </x-metadata>
+ </metadata>
+ <manifest>
+ <item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
+ <item id="style" media-type="text/css" href="nikkei.css"></item>
+ <item id="index" media-type="text/html" href="toc.html"></item>
+ XML
+ end
+
+ def opf_item( uri )
+ aid = uri2aid( uri )
+ aid ? %Q|\t\t<item id="#{aid}" media-type="text/html" href="#{aid}.html"></item>| : ''
+ end
+
+ def opf_footer( aids )
+ items = aids.map{|aid| %Q|\t<itemref idref="#{aid}" />|}
+ <<~XML
+ </manifest>
+ <spine toc="toc">
+ #{items.join("\n")}
+ <itemref idref="index" />
+ </spine>
+ <tours></tours>
+ <guide>
+ <reference type="toc" title="Table of Contents" href="toc.html"></reference>
+ <reference type="start" title="Top Story" href="#{aids[0]}.html"></reference>
+ </guide>
+ </package>
+ XML
+ end
+
+ def uri2aid( uri )
+ uri.scan( %r|/article/([^/]*)/| ).flatten[0]
+ end
+
+ def generate_contents( toc, agent )
+ open( "#{@dst_dir}/toc.html", 'w:utf-8' ) do |html|
+ open( "#{@dst_dir}/toc.ncx", 'w:utf-8' ) do |ncx|
+ open( "#{@dst_dir}/#{basename}.opf", 'w:utf-8' ) do |opf|
+ first = true
+ toc_index = 0
+ aids = []
+ ncx.puts ncx_header
+ opf.puts opf_header
+ toc.each do |category|
+ category.each do |article|
+ if article.class == String
+ html.puts first ?
+ html_header( 'Table of Contents' ) :
+ "\t</ul>\n\t<mbp:pagebreak />"
+ html.puts "\t<h2>#{article}</h2>"
+ html.puts "\t<ul>"
+ first = false
+ else
+ begin
+ html.puts html_item( article[0], article[1], agent )
+ ncx.puts ncx_item( article[0], article[1], toc_index += 1 )
+ unless aids.index( uri2aid( article[1] ) )
+ opf.puts opf_item( article[1] )
+ aids << uri2aid( article[1] ) if uri2aid( article[1] )
+ end
+ rescue IllegalPage
+ end
+ end
+ end
+ end
+ html.puts "\t</ul>"
+ html.puts html_footer
+ ncx.puts ncx_footer
+ opf.puts opf_footer( aids )
+ end
+ end
+ end
+ end
+ end
+ end
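Editorial illustration with a made-up article path: uri2aid extracts the article id that names the cached source HTML, the per-article output file, and the manifest/spine entries. (Credentials themselves come from the pit gem; as far as pit's usual behavior goes, Pit::get prompts for nikkei_user and nikkei_pass when they are not yet stored.)

uri = '/article/DGXMZOexample0001/?n_cid=kobetsu'   # hypothetical path
uri.scan( %r|/article/([^/]*)/| ).flatten[0]        # => "DGXMZOexample0001"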