news2kindle 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,236 @@
1
+ # scraping internet.watch.impress.co.jp for News2Kindle
2
+ #
3
+
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+ require 'uri'
7
+ require 'ostruct'
8
+ require 'tmpdir'
9
+ require 'pathname'
10
+ require 'fileutils'
11
+
12
+ module News2Kindle
13
+ module Generator
14
+ class InternetWatch
15
+ TOP = 'https://internet.watch.impress.co.jp'
16
+
17
# Prepare per-run working directories under +tmpdir+ and copy the
# static resources (cover image and stylesheet) into the output dir.
def initialize(tmpdir)
  @current_dir = tmpdir

  @src_dir = "#{@current_dir}/src"   # raw HTML cache
  Dir.mkdir(@src_dir)

  @dst_dir = "#{@current_dir}/dst"   # generated e-book content
  Dir.mkdir(@dst_dir)
  %w[internet-watch.jpg internet-watch.css].each do |asset|
    FileUtils.cp("./resource/#{asset}", @dst_dir)
  end
end
28
+
29
# Fetch the RSS feed, download each not-yet-seen article (plus its
# on-site images), and write the article HTML files, the TOC
# (toc.html / toc.ncx) and the OPF package under @dst_dir.
# Yields the path of the generated OPF file.
#
# opts[:now] - Time used for the volume title and metadata dates.
def generate(opts)
  now = opts[:now]
  items = []

  rdf_file = "http://rss.rssad.jp/rss/internetwatch/internet.rdf"
  rdf = retry_loop( 5 ) do
    # URI.open: Kernel#open lost URL support in Ruby 3
    Nokogiri(URI.open(rdf_file, 'r:utf-8', &:read))
  end
  (rdf / 'item' ).each do |item|
    uri = URI( item.attr( 'rdf:about' ).to_s )
    next unless /internet\.watch\.impress\.co\.jp/ =~ uri.host
    uri.query = nil # remove query of 'ref=rss'
    next if News2Kindle::DupChecker.dup?(uri)

    title = (item / 'title').text
    date = item.elements.map{|e| e.text if e.name == 'date'}.join
    items << OpenStruct::new( :uri => uri, :title => title, :date => date )
  end
  items.sort_by!(&:date) # oldest first

  now_str = now.strftime( '%Y-%m-%d %H:%M' )

  #
  # generating articles in html
  #
  items.each do |item|
    begin
      article = get_article( item.uri )
      File.open( "#{@dst_dir}/#{item_id item.uri}.html", 'w' ) do |f|
        f.puts html_header( item.title )
        contents = (article / 'div.mainContents')
        (contents / 'img').each do |img|
          org = img.attr('ajax') || img.attr('src')
          next if org =~ /^http/ # skip images on other servers
          begin
            img_file = retry_loop( 5 ) do
              URI.open( "#{TOP}#{org}", &:read )
            end
            cache = "#{org.gsub( /\//, '_' ).sub( /^_/, '' )}"
            # 'wb': image data is binary; img_out avoids shadowing the outer handle
            File.open( "#{@dst_dir}/#{cache}", 'wb' ){|img_out| img_out.write img_file}
            img.set_attribute( 'src', cache )
          rescue OpenURI::HTTPError
            News2Kindle.logger.error "skipped an image: #{TOP}#{org}"
          end
        end
        f.puts contents.inner_html
        f.puts html_footer
      end
    rescue
      # best-effort: a broken article must not abort the whole volume
      News2Kindle.logger.warn "#{$!.class}: #$!"
      News2Kindle.logger.warn "skipped an article: #{item.uri}"
    end
  end

  #
  # generating TOC in html
  #
  File.open( "#{@dst_dir}/toc.html", 'w:utf-8' ) do |f|
    f.write html_header( 'Table of Contents' )
    if items.empty?
      f.puts %Q|<p>本日は記事がありません。</p>|
    else
      f.puts "<ul>"
      items.each do |item|
        f.puts %Q|\t<li><a href="#{item_id item.uri}.html">#{item.title}</a></li>|
      end
      f.puts "</ul>"
    end
    f.write html_footer
  end

  #
  # generating TOC in ncx
  #
  File.open( "#{@dst_dir}/toc.ncx", 'w:utf-8' ) do |f|
    f.write <<-XML.gsub( /^\t/, '' )
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<docTitle><text>INTERNET Watch (#{now_str})</text></docTitle>
<navMap>
<navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
    XML

    items.each_with_index do |item, index|
      f.puts %Q|\t\t<navPoint id="#{item_id item.uri}" playOrder="#{index}"><navLabel><text>#{item.title}</text></navLabel><content src="#{item_id item.uri}.html" /></navPoint>|
    end

    f.write <<-XML.gsub( /^\t/, '' )
</navMap>
</ncx>
    XML
  end

  #
  # generating OPF
  #
  File.open( "#{@dst_dir}/internet-watch.opf", 'w:utf-8' ) do |f|
    f.write <<-XML.gsub( /^\t/, '' )
<?xml version="1.0" encoding="utf-8"?>
<package unique-identifier="uid">
<metadata>
<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
<dc:Title>INTERNET Watch (#{now_str})</dc:Title>
<dc:Language>ja-JP</dc:Language>
<dc:Creator>インプレス</dc:Creator>
<dc:Description>INTERNET Watch、#{now_str}生成</dc:Description>
<dc:Date>#{now.strftime( '%d/%m/%Y' )}</dc:Date>
</dc-metadata>
<x-metadata>
<output encoding="utf-8" content-type="text/x-oeb1-document"></output>
<EmbeddedCover>internet-watch.jpg</EmbeddedCover>
</x-metadata>
</metadata>
<manifest>
<item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
<item id="style" media-type="text/css" href="internet-watch.css"></item>
<item id="index" media-type="text/html" href="toc.html"></item>
    XML

    items.each do |item|
      f.puts %Q|\t\t<item id="#{item_id item.uri}" media-type="text/html" href="#{item_id item.uri}.html"></item>|
    end

    f.write <<-XML.gsub( /^\t/, '' )
</manifest>
<spine toc="toc">
<itemref idref="index" />
    XML

    items.each do |item|
      f.puts %Q|\t<itemref idref="#{item_id item.uri}" />\n|
    end

    f.write <<-XML.gsub( /^\t/, '' )
</spine>
<tours></tours>
<guide>
<reference type="toc" title="Table of Contents" href="toc.html"></reference>
<reference type="start" title="Table of Contents" href="toc.html"></reference>
</guide>
</package>
    XML
  end

  yield "#{@dst_dir}/internet-watch.opf"
end
176
+
177
+ private
178
+
179
# Run the given block, retrying on any StandardError. After +times+
# failed attempts the last exception is re-raised; between attempts
# the failure is logged and we pause one second.
def retry_loop( times )
  attempts = 0
  begin
    yield
  rescue
    attempts += 1
    raise if attempts >= times
    News2Kindle.logger.error $!
    News2Kindle.logger.info "#{attempts} retry."
    sleep 1
    retry
  end
end
195
+
196
# Derive a stable page id from an article URI: the path's basename
# with the '.html' extension stripped.
def item_id( uri )
  File.basename(uri.path, '.html')
end
199
+
200
# Return the parsed article at +uri+, caching the raw HTML under
# @src_dir so repeated runs don't re-fetch. Bytes that can't be
# represented in UTF-8 are replaced with '?' before parsing.
def get_article( uri )
  cache = "#{@src_dir}/#{File.basename uri.path}"
  begin
    html = File.open( cache, &:read )
  rescue Errno::ENOENT
    html = retry_loop( 5 ) do
      # OpenURI::OpenRead#open — Kernel#open(uri) no longer works on Ruby >= 3.0
      uri.open(&:read)
    end
    File.open( cache, 'w' ){|f| f.write html }
  end
  Nokogiri( html.encode 'UTF-8', invalid: :replace, undef: :replace, replace: '?' )
end
213
+
214
# HTML preamble shared by article pages and the TOC: links the
# bundled internet-watch.css and repeats +title+ as the page heading.
# The gsub strips the heredoc's leading tab indentation from each line.
def html_header( title )
	<<-HTML.gsub( /^\t/, '' )
	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
	<html>
	<head>
	<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
	<title>#{title}</title>
	<link rel="stylesheet" href="internet-watch.css" type="text/css" media="all"></link>
	</head>
	<body>
	<h1>#{title}</h1>
	HTML
end
227
+
228
# Closing tags matching html_header; appended to every generated page.
def html_footer
	<<-HTML.gsub( /^\t/, '' )
	</body>
	</html>
	HTML
end
234
+ end
235
+ end
236
+ end
@@ -0,0 +1,18 @@
1
+ # scraping nikkei.com (for free user) for News2Kindle
2
+ #
3
+
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+ require 'tmpdir'
7
+ require 'pathname'
8
+ require (File.dirname(__FILE__) + '/nikkei-paid')
9
+
10
module News2Kindle
  module Generator
    # Nikkei generator for non-subscribers: identical scraping to
    # NikkeiPaid, but skips the login step by supplying no credentials.
    class NikkeiFree < NikkeiPaid
      # @return [Array(nil, nil)] no ID / password pair
      def auth
        [nil, nil]
      end
    end
  end
end
@@ -0,0 +1,352 @@
1
+ # scraping nikkei.com (for paid user) for News2Kindle
2
+ #
3
+
4
+ require 'mechanize'
5
+ require 'nokogiri'
6
+ require 'open-uri'
7
+ require 'tmpdir'
8
+ require 'pathname'
9
+
10
+ module News2Kindle
11
+ module Generator
12
+ class NikkeiPaid
13
+ class IllegalPage < StandardError; end
14
+
15
+ TOP = 'https://www.nikkei.com'
16
+ LOGIN = "#{TOP}/etc/accounts/login?dps=3&amp;pageflag=top&amp;url=http%3A%2F%2Fwww.nikkei.com%2F"
17
+
18
# Resolve login credentials via #auth, prepare per-run working
# directories under +tmpdir+, and copy the static resources
# (cover image and stylesheet) into the output dir.
def initialize(tmpdir)
  @nikkei_id, @nikkei_pw = auth
  @current_dir = tmpdir

  @src_dir = "#{@current_dir}/src"   # raw HTML cache
  Dir.mkdir(@src_dir)

  @dst_dir = "#{@current_dir}/dst"   # generated e-book content
  Dir.mkdir(@dst_dir)
  %w[nikkei.jpg nikkei.css].each do |asset|
    FileUtils.cp("./resource/#{asset}", @dst_dir)
  end
end
30
+
31
# Build the e-book: log in when credentials are available, scrape the
# nikkei.com top page for article links, write all content files via
# generate_contents, then yield the path of the generated OPF.
#
# opts[:now] - Time used for the volume title and metadata dates.
#
# The collected +toc+ is an array of categories; within a category,
# String elements are section headings and 2-element arrays are
# [title, uri] pairs (generate_contents relies on this shape).
def generate(opts)
  @now = opts[:now]
  @now_str = @now.strftime '%Y-%m-%d %H:%M'

  agent = Mechanize::new
  agent.set_proxy( *ENV['HTTP_PROXY'].split( /:/ ) ) if ENV['HTTP_PROXY']

  toc = []
  if @nikkei_id and @nikkei_pw
    # drop any stale session first, then walk the multi-step login
    # form chain (auto-post redirect -> credential form -> final post)
    agent.get('https://regist.nikkei.com/ds/etc/accounts/logout')
    agent.get( LOGIN )
    agent.page.form_with( :name => 'autoPostForm' ).submit
    agent.page.form_with( :name => 'LA7010Form01' ) do |form|
      form['LA7010Form01:LA7010Email'] = @nikkei_id
      form['LA7010Form01:LA7010Password'] = @nikkei_pw
      form.click_button
    end
    agent.page.forms.first.submit
  else
    agent.get( TOP )
  end

  #
  # scraping top news
  #
  toc_top = ['TOP NEWS']
  %w(first second third fourth).each do |category|
    (agent.page / "div.nx-top_news_#{category} h3 a").each do |a|
      uri = a.attr('href')
      next if News2Kindle::DupChecker.dup?(uri)
      toc_top << [canonical( a.text.strip ), uri]
    end
  end
  toc << toc_top

  #
  # scraping all categories
  #
  (agent.page / 'div.cmnc-genre').each do |genre|
    toc_cat = []
    (genre / 'h4.cmnc-genre_title a.cmnc-title_text').each do |cat|
      next if /local/ =~ cat.attr( 'href' ) # skip regional news sections
      toc_cat << cat.text
      (genre / 'li a').each do |article|
        uri = article.attr('href')
        next if News2Kindle::DupChecker.dup?(uri)
        toc_cat << [canonical( article.text ), uri]
      end
    end
    toc << toc_cat
  end

  begin
    generate_contents( toc, agent )
    yield "#{@dst_dir}/#{basename}.opf"
  end

  # close the session we opened above
  if @nikkei_id and @nikkei_pw
    agent.get('https://regist.nikkei.com/ds/etc/accounts/logout')
  end
end
92
+
93
+ private
94
+
95
# Obtain Nikkei credentials from local Pit storage (prompting the
# user when unset). Subclasses may override — NikkeiFree returns
# no credentials to skip login.
# @return [Array(String, String)] id and password
def auth
  require 'pit'
  credentials = Pit::get('news2kindle', require: {
    nikkei_user: 'your ID of Nikkei.',
    nikkei_pass: 'your Password of Nikkei.',
  })
  credentials.values_at(:nikkei_user, :nikkei_pass)
end
103
+
104
# Derive the output file basename from the class name, e.g.
# News2Kindle::Generator::NikkeiPaid -> "nikkei-paid".
def basename
  simple_name = self.class.to_s.sub(/.*:/, '')
  simple_name.gsub(/([A-Z])/) { "-#{Regexp.last_match(1)}" }.sub(/^-/, '').downcase
end
107
+
108
# Normalize FULLWIDTH TILDE (U+FF5E) to WAVE DASH (U+301C) to work
# around the well-known wave-dash encoding mapping problem.
def canonical( str )
  str.tr("\uFF5E", "\u301C")
end
111
+
112
# Run the block, retrying on any StandardError up to +times+ attempts;
# re-raises the last exception after the final failure.
# Sleeps one second between attempts so the remote server isn't
# hammered (consistent with the InternetWatch generator's retry_loop).
def retry_loop( times )
  count = 0
  begin
    yield
  rescue
    count += 1
    raise if count >= times
    News2Kindle.logger.error $!
    News2Kindle.logger.info "#{count} retry."
    sleep 1 # back off instead of retrying immediately
    retry
  end
end
127
+
128
# HTML preamble shared by article pages and the TOC: links the
# bundled nikkei.css and repeats +title+ as the page heading.
def html_header( title )
  <<~HTML
    <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
    <html>
    <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
    <title>#{title}</title>
    <link rel="stylesheet" href="nikkei.css" type="text/css" media="all"></link>
    </head>
    <body>
    <h1>#{title}</h1>
  HTML
end
141
+
142
# Fetch one article page and return its parsed root node, caching the
# HTML under @src_dir. +sub+ suffixes the cache filename to keep
# follow-up pages of the same article apart.
#
# Note: the original mutated +uri+ in place with sub!; we now work on
# a copy so the caller's string is left untouched.
def get_html_item( agent, uri, sub = nil )
  uri = uri.sub( %r|^https://www.nikkei.com|, '' ) # relative path under TOP
  aid = uri2aid( uri )
  cache = "#{@src_dir}/#{aid}#{sub}.html"
  html = nil
  if File.exist?( cache ) # loading cache
    html = Nokogiri( File.open( cache, 'r:utf-8', &:read ) )
  else
    begin
      retry_loop( 5 ) do
        agent.get( "#{TOP}#{uri}" )
        html = agent.page.root
        sleep 1 # be polite to the server between fetches
      end
    rescue
      News2Kindle.logger.error "cannot get #{TOP}#{uri}."
      raise
    end
    File.open( cache, 'w:utf-8' ) do |f|
      f.write( html.to_html )
    end
  end
  html
end
166
+
167
# Extract the readable body from a parsed article page: paragraphs,
# tables, and images (downloaded into @dst_dir and rewritten to local
# references). Returns an HTML fragment String.
def scrape_html_item( html )
  result = ''
  (html / 'div.cmn-article_text').each do |div|
    div.children.each do |e|
      case e.name
      when 'p'
        next unless (e / 'a.cmnc-continue').empty? # teaser link, not body text
        (e / 'span.JSID_urlData').remove
        para = canonical e.text.strip.sub( /^ /, '' )
        result << "\t<p>#{para}</p>" unless para.empty?
      when 'table'
        result << e.to_html
      when 'div'
        e.css('img').each do |img|
          image_url = img['src']
          next if /^http/ =~ image_url # skip images in other server
          next if /^\/\// =~ image_url # skip assets
          image_file = File::basename( image_url )
          begin
            # PN -> PB presumably selects a larger rendition of the
            # same photo — TODO confirm against the site's URL scheme.
            # URI.open: Kernel#open lost URL support in Ruby 3.
            image = URI.open( "#{TOP}#{image_url.sub(/PN/, 'PB')}", &:read )
            # 'wb': image data is binary
            File.open( "#{@dst_dir}/#{image_file}", 'wb' ){|fp| fp.write image}
            result << %Q|\t<div>|
            result << %Q|\t\t<img src="#{image_file}">|
            result << %Q|\t\t<p>[#{e.text}]</p>| unless e.text.strip.empty?
            result << %Q|\t</div>|
          rescue
            News2Kindle.logger.debug $!
            News2Kindle.logger.warn "FAIL TO DOWNLOAD IMAGE: #{image_url}"
          end
        end
      end
    end
  end
  result
end
203
+
204
# Render one article into "#{aid}.html" under @dst_dir and return the
# TOC list item (<li>) linking to it. Follow-up pages (the
# cmn-article_nation pagination links) are appended into the same
# file. Returns '' when the URI carries no article id. Raises
# IllegalPage when the page lacks the expected title node; the
# partial output file is deleted before raising.
def html_item( item, uri, agent )
  aid = uri2aid( uri )
  return '' unless aid
  html = get_html_item( agent, uri )
  out_file = "#{@dst_dir}/#{aid}.html"

  begin
    open( out_file, 'w:utf-8' ) do |f|
      f.puts canonical( html_header( (html / 'h1.cmn-article_title, h4.cmn-article_title, h2.cmn-article_title')[0].text.strip ) )
      f.puts scrape_html_item( html )
      (html / 'div.cmn-article_nation ul li a').map {|link|
        link.attr( 'href' )
      }.sort.uniq.each_with_index do |link,index|
        # sub-page caches are suffixed 2, 3, ... to keep them apart
        f.puts scrape_html_item( get_html_item( agent, link, index + 2 ) )
      end
      f.puts html_footer
    end

    %Q|\t\t<li><a href="#{aid}.html">#{item}</a></li>|
  rescue NoMethodError
    # [0].text above blew up: no title node, treat the page as invalid
    News2Kindle.logger.debug $!
    News2Kindle.logger.error "page parsing faild. #{aid}"
    File.delete out_file
    raise IllegalPage.new
  end
end
230
+
231
# Closing tags matching html_header; appended to every generated page.
def html_footer
  "</body>\n</html>\n"
end
237
+
238
# NCX preamble: document title carrying the generation timestamp
# (@now_str) and the navMap opening, whose first entry is the TOC
# page itself at playOrder 0.
def ncx_header
  <<~XML
    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
    <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
    <docTitle><text>日経電子版 (#{@now_str})</text></docTitle>
    <navMap>
    <navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
  XML
end
248
+
249
# One NCX navPoint for article +item+ (title) at +uri+; returns ''
# when the URI carries no article id.
def ncx_item( item, uri, index )
  aid = uri2aid( uri )
  return '' unless aid
  %Q|\t\t<navPoint id="#{aid}" playOrder="#{index}"><navLabel><text>#{item}</text></navLabel><content src="#{aid}.html" /></navPoint>|
end
253
+
254
# Close the navMap and the NCX document.
def ncx_footer
  "</navMap>\n</ncx>\n"
end
260
+
261
# OPF preamble: Dublin Core metadata (title/description carry the
# generation timestamp @now_str; the date uses @now) plus the fixed
# manifest entries for the NCX, stylesheet and TOC page. Article
# items are appended afterwards via opf_item.
def opf_header
  <<~XML
    <?xml version="1.0" encoding="utf-8"?>
    <package unique-identifier="uid">
    <metadata>
    <dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
    <dc:Title>日経電子版 (#{@now_str})</dc:Title>
    <dc:Language>ja-JP</dc:Language>
    <dc:Creator>日本経済新聞社</dc:Creator>
    <dc:Description>日経電子版、#{@now_str}生成</dc:Description>
    <dc:Date>#{@now.strftime( '%d/%m/%Y' )}</dc:Date>
    </dc-metadata>
    <x-metadata>
    <output encoding="utf-8" content-type="text/x-oeb1-document"></output>
    <EmbeddedCover>nikkei.jpg</EmbeddedCover>
    </x-metadata>
    </metadata>
    <manifest>
    <item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
    <item id="style" media-type="text/css" href="nikkei.css"></item>
    <item id="index" media-type="text/html" href="toc.html"></item>
  XML
end
284
+
285
# One OPF manifest entry for the article at +uri+; returns '' when
# the URI carries no article id.
def opf_item( uri )
  aid = uri2aid( uri )
  return '' unless aid
  %Q|\t\t<item id="#{aid}" media-type="text/html" href="#{aid}.html"></item>|
end
289
+
290
# Close the OPF: the spine lists the articles in +aids+ order with
# the TOC page last, then the tours/guide sections. The "start"
# guide reference points at the first article.
def opf_footer( aids )
  spine_refs = aids.map { |aid| %Q|\t<itemref idref="#{aid}" />| }.join("\n")
  <<~XML
    </manifest>
    <spine toc="toc">
    #{spine_refs}
    <itemref idref="index" />
    </spine>
    <tours></tours>
    <guide>
    <reference type="toc" title="Table of Contents" href="toc.html"></reference>
    <reference type="start" title="Top Story" href="#{aids[0]}.html"></reference>
    </guide>
    </package>
  XML
end
306
+
307
# Extract the article id from a URI string of the form
# ".../article/<aid>/..."; nil when the pattern doesn't match.
def uri2aid( uri )
  m = uri.match(%r|/article/([^/]*)/|)
  m && m[1]
end
310
+
311
# Write toc.html, toc.ncx and the OPF in a single pass over +toc+.
# +toc+ is an array of categories; within a category, String
# elements are section headings and 2-element arrays are
# [title, uri] pairs (see #generate, which builds this structure).
def generate_contents( toc, agent )
  open( "#{@dst_dir}/toc.html", 'w:utf-8' ) do |html|
    open( "#{@dst_dir}/toc.ncx", 'w:utf-8' ) do |ncx|
      open( "#{@dst_dir}/#{basename}.opf", 'w:utf-8' ) do |opf|
        first = true
        toc_index = 0
        aids = [] # article ids already added to the OPF manifest/spine
        ncx.puts ncx_header
        opf.puts opf_header
        toc.each do |category|
          category.each do |article|
            if article.class == String
              # a heading: emit the HTML preamble before the very first
              # one; otherwise close the previous list with a page break
              html.puts first ?
                html_header( 'Table of Contents' ) :
                "\t</ul>\n\t<mbp:pagebreak />"
              html.puts "\t<h2>#{article}</h2>"
              html.puts "\t<ul>"
              first = false
            else
              begin
                html.puts html_item( article[0], article[1], agent )
                ncx.puts ncx_item( article[0], article[1], toc_index += 1 )
                unless aids.index( uri2aid( article[1] ) )
                  opf.puts opf_item( article[1] )
                  aids << uri2aid( article[1] ) if uri2aid( article[1] )
                end
              rescue IllegalPage
                # skip unparsable pages; html_item already logged them
              end
            end
          end
        end
        html.puts "\t</ul>"
        html.puts html_footer
        ncx.puts ncx_footer
        opf.puts opf_footer( aids )
      end
    end
  end
end
350
+ end
351
+ end
352
+ end