news2kindle 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,135 @@
1
+ # scraping tDiary's N-Year diary for News2Kindle
2
+ #
3
+
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+ require 'uri'
7
+
8
+ module News2Kindle
9
+ module Generator
10
+ class Tdiary
11
# Prepares the working directory for one tDiary conversion run.
#
# tmpdir:: path of the directory that receives the generated files.
#
# Copies ./resource/tdiary.css (relative to the current working directory)
# into tmpdir; the OPF manifest written by #generate lists that stylesheet.
def initialize( tmpdir )
	# None of the requires visible at the top of this file is guaranteed to
	# load FileUtils, so pull it in here before it is used.
	require 'fileutils'

	@current_dir = tmpdir
	FileUtils.cp( "./resource/tdiary.css", @current_dir )
end
15
+
16
# Fetches the tDiary page for the month/day of opts[:now] and writes the
# e-book sources index.html, toc.ncx and tdiary.opf into the working directory.
#
# opts:: Hash with
#        :now        - object responding to strftime; selects the day (MMDD)
#        :tdiary_top - top URL of the diary; ENV['TDIARY_TOP'] is the fallback
#
# Yields the path of the generated tdiary.opf.
#
# Raises ArgumentError when no top URL is configured, and re-raises the error
# of the last fetch attempt when all retries fail.
def generate(opts)
	require 'erb' # ERB::Util.html_escape for text interpolated into XML

	now = opts[:now]
	@top = opts[:tdiary_top] || ENV['TDIARY_TOP']
	if @top.nil? || @top.empty?
		raise ArgumentError, 'tDiary top URL is not set (opts[:tdiary_top] or ENV["TDIARY_TOP"])'
	end

	html = title = author = nil
	begin
		retry_loop( 5 ) do
			# URI#open from open-uri: unlike Kernel#open it never treats the
			# string as a local path or a "|command" pipe.
			page = URI.parse( "#{@top}?date=#{now.strftime '%m%d'}" ).open( 'r:utf-8', &:read )
			html = Nokogiri( page )
			title = (html / 'head title').text
			# a page without an author meta tag is not a fetch failure
			author_tag = (html / 'head meta[name="author"]').first
			author = author_tag ? author_tag['content'].to_s : ''
		end
	rescue => e
		News2Kindle.logger.info "failed by retry over: #{e.class}: #{e}"
		# without a page there is nothing to generate; do not continue with html == nil
		raise
	end

	#
	# generating html
	#
	html.css('head meta', 'head link', 'head style', 'script').remove
	html.css('div.adminmenu', 'div.sidebar', 'div.footer').remove
	(html / 'img').each do |img|
		# point every image at its downloaded local copy
		img['src'] = save_image( img['src'] )
	end
	File.open( "#{@current_dir}/index.html", 'w' ) {|f| f.write html.to_html }

	# page text may contain &, < or >, which must not reach the XML files raw
	xml_title = ERB::Util.html_escape( title )
	xml_author = ERB::Util.html_escape( author )

	#
	# generating TOC in ncx
	#
	File.open( "#{@current_dir}/toc.ncx", 'w:utf-8' ) do |f|
		# all leading indentation is stripped so the XML declaration is the
		# very first thing in the file, whatever the nesting depth here is
		f.write <<-XML.gsub( /^[ \t]+/, '' )
		<?xml version="1.0" encoding="UTF-8"?>
		<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
		<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
		<docTitle><text>#{xml_title}</text></docTitle>
		<navMap>
			<navPoint id="index" playOrder="1">
				<navLabel>
					<text>#{xml_title}</text>
				</navLabel>
				<content src="index.html" />
			</navPoint>
		</navMap>
		</ncx>
		XML
	end

	#
	# generating OPF
	#
	File.open( "#{@current_dir}/tdiary.opf", 'w:utf-8' ) do |f|
		f.write <<-XML.gsub( /^[ \t]+/, '' )
		<?xml version="1.0" encoding="utf-8"?>
		<package unique-identifier="uid">
			<metadata>
				<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
					<dc:Title>#{xml_title}</dc:Title>
					<dc:Language>ja-JP</dc:Language>
					<dc:Creator>#{xml_author}</dc:Creator>
					<dc:Description>tDiary N-Year Diary</dc:Description>
					<dc:Date>#{now.strftime( '%d/%m/%Y' )}</dc:Date>
				</dc-metadata>
			</metadata>
			<manifest>
				<item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
				<item id="style" media-type="text/css" href="tdiary.css"></item>
				<item id="index" media-type="text/html" href="index.html"></item>
			</manifest>
			<spine toc="toc">
				<itemref idref="index" />
			</spine>
			<tours></tours>
			<guide>
				<reference type="start" title="Start Page" href="index.html"></reference>
			</guide>
		</package>
		XML
	end

	yield "#{@current_dir}/tdiary.opf"
end
98
+
99
+ private
100
+
101
# Runs the given block and runs it again, after a one second pause, whenever
# it raises a StandardError.
#
# times:: number of failures after which the error is re-raised instead of
#         retried (the block always runs at least once).
#
# Returns the value of the block.
def retry_loop( times )
	failures = 0
	begin
		yield
	rescue => error
		failures += 1
		# give up: let the error of this last attempt propagate
		raise if failures >= times

		News2Kindle.logger.debug error
		News2Kindle.logger.info "#{failures} retry."
		sleep 1
		retry
	end
end
117
+
118
# Downloads one image of the diary page into the working directory.
#
# img:: value of an <img> src attribute. Anything that does not start with
#       http: or https: is resolved against @top.
#
# Returns the randomly generated local file name (without directory). The
# name is returned even when the download failed; such failures are only
# logged as warnings.
def save_image(img)
	require 'securerandom'

	# URI.join follows URL resolution rules ("/x.png", "../x.png", ...), which
	# plain string concatenation with @top does not
	img = URI.join( @top, img ).to_s if /^https?:/ !~ img
	uri = URI(img)
	# extension of the URL path only, so query strings or host names never
	# end up in the file name
	file_name = "#{SecureRandom.hex}#{File.extname( uri.path.to_s )}"
	begin
		# fetch first: a failed download must not leave an empty file behind
		data = uri.open( &:read )
		# image data is binary
		File.open( "#{@current_dir}/#{file_name}", 'wb' ) {|f| f.write data }
	rescue OpenURI::HTTPError, RuntimeError, Errno::ENOENT
		News2Kindle.logger.warn "#$!: #{uri}"
	end
	file_name
end
133
+ end
134
+ end
135
+ end
@@ -0,0 +1,360 @@
1
+ # -*- coding: utf-8; -*-
2
+ #
3
+ # scraping jp.wsj.com for Kindlizer
4
+ #
5
+
6
+ require 'mechanize'
7
+ require 'nokogiri'
8
+ require 'open-uri'
9
+ require 'tmpdir'
10
+ require 'pathname'
11
+ require 'json'
12
+
13
+ module News2Kindle
14
+ module Generator
15
+ class WsjPaid
16
+ TOP = 'http://jp.wsj.com'
17
+ LOGIN = "https://id.wsj.com/access/pages/wsj/jp/login_standalone.html"
18
+
19
# Loads the WSJ credentials and prepares the work directories.
#
# tmpdir:: path of the directory used as working area; src/ and dst/ are
#          created below it when missing.
#
# The credentials are read through the pit library (entry 'wsj') when it can
# be loaded, otherwise from the environment variables WSJ_ID and WSJ_PW.
# The cover image and the stylesheet are copied from ./resource into dst/.
def initialize( tmpdir )
	begin
		require 'pit'
		account = Pit::get( 'wsj', :require => {
			'user' => 'your ID of WSJ.',
			'pass' => 'your Password of WSJ.',
		} )
		@wsj_id, @wsj_pw = account['user'], account['pass']
	rescue LoadError # no pit library, using environment variables
		@wsj_id, @wsj_pw = ENV['WSJ_ID'], ENV['WSJ_PW']
	end

	@current_dir = tmpdir
	@src_dir = @current_dir + '/src'
	@dst_dir = @current_dir + '/dst'
	[@src_dir, @dst_dir].each do |dir|
		Dir::mkdir( dir ) unless File.exist?( dir )
	end

	%w(wsj.jpg wsj.css).each do |resource|
		FileUtils.cp( "./resource/#{resource}", @dst_dir )
	end
end
43
+
44
# Logs in to WSJ Japan, collects the article links of the home page and of
# every category page, and writes the e-book sources through generate_contents.
#
# opts:: Hash; opts[:now] must respond to strftime (used for titles and dates).
#
# Yields the path of the generated wsj-paid.opf.
#
# Raises the error of the last attempt when a category page cannot be fetched
# after five tries.
def generate(opts)
	@now = opts[:now]
	@now_str = @now.strftime '%Y-%m-%d %H:%M'
	@title = "WSJ日本版"
	@lang = "ja-JP"

	agent = Mechanize::new
	agent.set_proxy( *ENV['HTTP_PROXY'].split( /:/ ) ) if ENV['HTTP_PROXY']

	# only links to article pages of jp.wsj.com go into the table of contents
	article_url = %r{\Ahttp://jp\.wsj\.com/article/}

	toc = []

	agent.get(LOGIN)

	form = agent.page.forms.first
	form.action = 'https://id.wsj.com/auth/submitlogin.json'
	form['username'] = @wsj_id
	form['password'] = @wsj_pw
	form.submit

	# the login endpoint answers with JSON naming the URL to continue with
	response = JSON.parse(agent.page.body)
	agent.get( response["url"] )

	agent.get( TOP + "/home-page?_wsjregion=asia,jp&_homepage=/home/jp")

	#
	# scraping top news
	#
	toc_top = ['TOP NEWS']
	(agent.page / "div.whatsNews ul.newsItem h2 a").each do |a|
		href = a.attr( 'href' )
		toc_top << [canonical( a.text.strip ), href] if article_url =~ href.to_s
	end
	toc << toc_top

	#
	# scraping all categories
	#
	# the first navigation entry is skipped
	(agent.page.root / 'div.wsjMainNav li').drop( 1 ).each do |li|
		link = (li / 'a').first
		next unless link # navigation item without an anchor

		category_url = link.attr( 'href' )
		toc_cat = [canonical( link.text.strip )]
		begin
			retry_loop( 5 ) do
				agent.get( category_url )
				sleep 1
			end
		rescue
			News2Kindle.logger.error "cannot get #{category_url}."
			raise
		end

		(agent.page / "div.leadModule" ).remove
		headlines = (agent.page / "div.headlineSummary ul.newsItem h2 a" ).select do |a|
			article_url =~ a.attr( 'href' ).to_s
		end
		# at most ten articles per category
		headlines.first( 10 ).each do |a|
			toc_cat << [canonical( a.text.strip ), a.attr( 'href' )]
		end
		toc << toc_cat
	end

	generate_contents( toc, agent )
	yield "#{@dst_dir}/wsj-paid.opf"
end
121
+
122
+ private
123
+
124
# Returns a copy of str in which every FULLWIDTH TILDE (U+FF5E) is replaced
# by WAVE DASH (U+301C), working around the WAVE DASH mapping problem.
def canonical( str )
	str.tr( "\uFF5E", "\u301C" )
end
127
+
128
# Runs the given block and immediately runs it again whenever it raises a
# StandardError.
#
# times:: number of failures after which the error is re-raised instead of
#         retried (the block always runs at least once).
#
# Returns the value of the block.
def retry_loop( times )
	failures = 0
	begin
		yield
	rescue => error
		failures += 1
		# give up: let the error of this last attempt propagate
		raise if failures >= times

		News2Kindle.logger.debug error
		News2Kindle.logger.info "#{failures} retry."
		retry
	end
end
143
+
144
# Builds the opening part of a generated HTML page, up to and including the
# <h1> heading.
#
# title:: plain text used for both <title> and <h1>; it is HTML-escaped here,
#         so callers pass raw text.
#
# Returns the markup as a String.
def html_header( title )
	require 'erb' # ERB::Util.html_escape

	# article titles may contain &, < or >
	safe_title = ERB::Util.html_escape( title )
	<<-HTML.gsub( /^\t/, '' )
	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
	<html>
	<head>
	<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
	<title>#{safe_title}</title>
	<link rel="stylesheet" href="wsj.css" type="text/css" media="all"></link>
	</head>
	<body>
	<h1>#{safe_title}</h1>
	HTML
end
157
+
158
# Returns the parsed HTML of one article, using the copy cached under
# @src_dir when there is one.
#
# agent:: Mechanize agent used to fetch the page
# uri::   article URL
# sub::   optional suffix appended to the article id in the cache file name
#
# A freshly fetched page is written to the cache before it is returned.
# Re-raises the fetch error after five failed attempts.
def get_html_item( agent, uri, sub = nil )
	cache_file = "#{@src_dir}/#{uri2aid( uri )}#{sub}.html"

	# loading cache
	if File::exist?( cache_file )
		return Nokogiri( File.open( cache_file, 'r:utf-8', &:read ) )
	end

	page = nil
	begin
		retry_loop( 5 ) do
			agent.get( uri )
			page = agent.page.root
			sleep 1
		end
	rescue
		News2Kindle.logger.error "cannot get #{uri}."
		raise
	end

	File.open( cache_file, 'w:utf-8' ) {|f| f.write( page.to_html ) }
	page
end
181
+
182
# Extracts the article body from a parsed article page and localizes its
# images.
#
# html:: parsed document of one article page
#
# The body is taken from div#article_story_body or, when that is absent, from
# div#slideContainer; navigation, inset and interactive widgets are removed.
# Every image is downloaded into @dst_dir and its src rewritten to the local
# file name; an image that cannot be downloaded keeps its original src.
#
# Returns the inner HTML of the extracted content as a String.
def scrape_html_item( html )
	contents = (html / 'div#article_story_body')

	if(contents.size == 0)
		contents = (html / 'div#slideContainer')
		if(contents.size > 0)
			# keep only the first slide in place of the slide viewer
			(contents / 'div.dSlideViewer').before((contents / 'div.dSlideViewer li.firstSlide').inner_html)
			(contents / 'div.dSlideViewer, h2.header, ul.nav-inline').remove
		end
	else
		signature = (contents / 'ul.socialByline')
		if(signature.size > 0)
			# replace the byline list by its plain text
			signature[0].before(signature.inner_text)
			signature.remove
		end
		(contents / 'div.insettipBox , div.insetButton').remove
		(contents / 'div.insetZoomTargetBox a').remove
		(contents / 'div.legacyInset div.embedType-interactive').each {|d| d.parent.remove}
	end

	(contents / 'img').each do |image_tag|
		image_url = image_tag.attr( 'src' )
		next if image_url.nil? || image_url.empty? # nothing to download

		image_file = File::basename( image_url )
		unless File.exist?( "#{@dst_dir}/#{image_file}" )
			begin
				# URI.join resolves site-relative sources against TOP; URI#open
				# (open-uri) never treats the value as a local path or a pipe
				image = URI.join( TOP, image_url ).open( &:read )
				# image data is binary
				File.open( "#{@dst_dir}/#{image_file}", 'wb' ) {|fp| fp.write image }
			rescue
				News2Kindle.logger.warn "FAIL TO DOWNLOAD IMAGE: #{image_url}"
				next
			end
		end
		image_tag.set_attribute( "src", image_file )
	end

	contents.inner_html
end
220
+
221
# Writes the HTML page of one article into @dst_dir and returns the matching
# entry for the table of contents.
#
# item::  headline text as found on the index page (plain text)
# uri::   article URL
# agent:: Mechanize agent used to fetch the article
#
# The page title is the og:title of the article when present, otherwise item.
#
# Returns an <li> line linking to the page, or '' when uri is not an article
# URL.
def html_item( item, uri, agent )
	require 'erb' # ERB::Util.html_escape

	aid = uri2aid( uri )
	return '' unless aid
	html = get_html_item( agent, uri )

	File.open( "#{@dst_dir}/#{aid}.html", 'w:utf-8' ) do |f|
		title_tag = (html / 'meta[@property="og:title"]').first
		# a meta tag without content attribute falls back to the headline too
		og_title = title_tag && title_tag.attr( "content" )
		title = og_title ? og_title.strip : item
		f.puts canonical( html_header( title ) )

		f.puts scrape_html_item(html)
		f.puts html_footer
	end

	# headline text may contain &, < or >
	%Q|\t\t<li><a href="#{aid}.html">#{ERB::Util.html_escape( item )}</a></li>|
end
237
+
238
# Returns the closing markup that ends a page started with html_header.
def html_footer
	closing = <<-HTML
	</body>
	</html>
	HTML
	# drop the one tab of source indentation in front of each line
	closing.gsub( /^\t/, '' )
end
244
+
245
# Returns the opening part of toc.ncx: XML declaration, DOCTYPE, document
# title (built from @title and @now_str), the start of <navMap> and the
# navPoint of the table of contents page.
def ncx_header
	# All leading indentation is stripped (not just one tab) so the XML
	# declaration is the very first thing in the output, whatever the
	# indentation of this source file is.
	<<-XML.gsub( /^[ \t]+/, '' )
	<?xml version="1.0" encoding="UTF-8"?>
	<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
	<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
	<docTitle><text>#{@title} (#{@now_str})</text></docTitle>
	<navMap>
		<navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
	XML
end
255
+
256
# Builds the NCX navPoint of one article.
#
# item::  headline text (plain text; it is XML-escaped here)
# uri::   article URL, from which the article id is derived
# index:: value of the playOrder attribute
#
# Returns the navPoint line, or '' when uri is not an article URL.
def ncx_item( item, uri, index )
	require 'erb' # ERB::Util.html_escape

	aid = uri2aid( uri )
	return '' unless aid

	# headline text may contain &, < or >, which would make the NCX malformed
	label = ERB::Util.html_escape( item )
	%Q|\t\t<navPoint id="#{aid}" playOrder="#{index}"><navLabel><text>#{label}</text></navLabel><content src="#{aid}.html" /></navPoint>|
end
260
+
261
# Returns the closing tags that end the document started with ncx_header.
def ncx_footer
	closing = <<-XML
	</navMap>
	</ncx>
	XML
	# drop the one tab of source indentation in front of each line
	closing.gsub( /^\t/, '' )
end
267
+
268
# Returns the opening part of the OPF package file: XML declaration, metadata
# (built from @title, @now_str, @lang and @now) and the fixed manifest entries
# for toc.ncx, wsj.css and toc.html. The <manifest> element is left open for
# the per-article items.
def opf_header
	# All leading indentation is stripped (not just one tab) so the XML
	# declaration is the very first thing in the output, whatever the
	# indentation of this source file is.
	<<-XML.gsub( /^[ \t]+/, '' )
	<?xml version="1.0" encoding="utf-8"?>
	<package unique-identifier="uid">
		<metadata>
			<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
				<dc:Title>#{@title} (#{@now_str})</dc:Title>
				<dc:Language>#{@lang}</dc:Language>
				<dc:Creator>The Wall Street Journal Online</dc:Creator>
				<dc:Description>#{@title}、#{@now_str}生成</dc:Description>
				<dc:Date>#{@now.strftime( '%d/%m/%Y' )}</dc:Date>
			</dc-metadata>
			<x-metadata>
				<output encoding="utf-8" content-type="text/x-oeb1-document"></output>
				<EmbeddedCover>wsj.jpg</EmbeddedCover>
			</x-metadata>
		</metadata>
		<manifest>
			<item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
			<item id="style" media-type="text/css" href="wsj.css"></item>
			<item id="index" media-type="text/html" href="toc.html"></item>
	XML
end
291
+
292
# Builds the OPF manifest entry of one article.
#
# uri:: article URL, from which the article id is derived
#
# Returns the <item> line, or '' when uri is not an article URL.
def opf_item( uri )
	aid = uri2aid( uri )
	return '' unless aid

	%Q|\t\t<item id="#{aid}" media-type="text/html" href="#{aid}.html"></item>|
end
296
+
297
# Returns the closing part of the OPF package file: end of the manifest, the
# spine (one itemref per article followed by the table of contents) and the
# guide.
#
# aids:: Array of article ids in reading order; may be empty.
#
# The "start" guide reference points at the first article, or at toc.html
# when there is no article at all.
def opf_footer( aids )
	# never emit href=".html" for an empty article list
	start_page = aids.empty? ? 'toc.html' : "#{aids[0]}.html"

	r = <<-XML.gsub( /^\t/, '' )
	</manifest>
	<spine toc="toc">
	XML
	aids.each do |aid|
		r << %Q|\t<itemref idref="#{aid}" />\n|
	end
	r << <<-XML.gsub( /^\t/, '' )
	<itemref idref="index" />
	</spine>
	<tours></tours>
	<guide>
	<reference type="toc" title="Table of Contents" href="toc.html"></reference>
	<reference type="start" title="Top Story" href="#{start_page}"></reference>
	</guide>
	</package>
	XML
	r
end
317
+
318
# Extracts the article id from an article URL.
#
# uri:: URL such as http://jp.wsj.com/article/<id>.html; any object is
#       converted with to_s first, so nil is accepted.
#
# Returns the id (the path segment between "/article/" and ".html") as a
# String, or nil when uri is not an article URL.
def uri2aid( uri )
	# the dot is escaped so that only a literal ".html" ends the id
	uri.to_s[%r|/article/([^/]*)\.html|, 1]
end
321
+
322
# Writes toc.html, toc.ncx and wsj-paid.opf into @dst_dir and, through
# html_item, one HTML page per article.
#
# toc::   Array of categories; each category is an Array whose String
#         elements are section headings and whose other elements are
#         [headline, url] pairs
# agent:: Mechanize agent handed on to html_item
def generate_contents( toc, agent )
	File.open( "#{@dst_dir}/toc.html", 'w:utf-8' ) do |html|
		File.open( "#{@dst_dir}/toc.ncx", 'w:utf-8' ) do |ncx|
			File.open( "#{@dst_dir}/wsj-paid.opf", 'w:utf-8' ) do |opf|
				first = true
				toc_index = 0
				aids = []
				ncx.puts ncx_header
				opf.puts opf_header

				toc.flatten( 1 ).each do |article|
					if article.class == String
						# a heading: the first one opens the page, every later
						# one closes the previous list and starts a new page
						if first
							html.puts html_header( 'Table of Contents' )
						else
							html.puts "\t</ul>\n\t<mbp:pagebreak />"
						end
						html.puts "\t<h2>#{article}</h2>"
						html.puts "\t<ul>"
						first = false
						next
					end

					headline, uri = article[0], article[1]
					html.puts html_item( headline, uri, agent )
					toc_index += 1
					ncx.puts ncx_item( headline, uri, toc_index )

					# each article goes into manifest and spine only once
					aid = uri2aid( uri )
					next if aids.index( aid )
					opf.puts opf_item( uri )
					aids << aid if aid
				end

				html.puts "\t</ul>"
				html.puts html_footer
				ncx.puts ncx_footer
				opf.puts opf_footer( aids )
			end
		end
	end
end
358
+ end
359
+ end
360
+ end