news2kindle 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,135 @@
1
+ # scraping tDiary's N-Year diary for News2Kindle
2
+ #
3
+
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+ require 'uri'
7
+
8
+ module News2Kindle
9
+ module Generator
10
+ class Tdiary
11
# Prepares the working directory for one tDiary conversion run.
#
# tmpdir - path of the directory that receives the generated files; the
#          bundled stylesheet is copied into it immediately.
#
# The stylesheet is looked up as ./resource/tdiary.css, i.e. relative to the
# current working directory of the process.
def initialize(tmpdir)
  # FileUtils is not required anywhere else in this file, so load it here
  # instead of relying on another library having pulled it in.
  require 'fileutils'

  @current_dir = tmpdir
  FileUtils.cp('./resource/tdiary.css', @current_dir)
end
15
+
16
# Downloads the tDiary "N-Year diary" page for opts[:now] and turns it into
# a minimal Kindle source set (index.html, toc.ncx, tdiary.opf) written to
# @current_dir.
#
# opts - Hash with:
#        :now        - object responding to #strftime (the day to fetch)
#        :tdiary_top - base URL of the diary; ENV['TDIARY_TOP'] is used
#                      when the key is absent
#
# Yields the path of the generated OPF file.
# Raises the error of the last download attempt when every retry failed.
def generate(opts)
  require 'cgi'

  now = opts[:now]
  @top = opts[:tdiary_top] || ENV['TDIARY_TOP']

  html = nil
  begin
    retry_loop(5) do
      # URI#open (open-uri) instead of Kernel#open: the latter no longer
      # accepts URLs on Ruby 3.0+.
      page_uri = URI.parse("#{@top}?date=#{now.strftime '%m%d'}")
      html = Nokogiri(page_uri.open('r:utf-8', &:read))
    end
  rescue => e
    News2Kindle.logger.info "failed by retry over: #{e.class}: #{e}"
    # Without a page there is nothing to convert; surface the real cause
    # instead of failing later with NoMethodError on nil.
    raise
  end

  # Read the metadata before the <head> children are stripped below. Both
  # values end up inside XML, so they are escaped; a page without an author
  # meta tag simply yields an empty creator.
  title = CGI.escapeHTML((html / 'head title').text)
  author_meta = (html / 'head meta[name="author"]').first
  author = CGI.escapeHTML(author_meta ? author_meta['content'].to_s : '')

  #
  # generating html
  #
  html.css('head meta', 'head link', 'head style', 'script').remove
  html.css('div.adminmenu', 'div.sidebar', 'div.footer').remove
  (html / 'img').each do |img|
    src = img['src']
    next if src.nil? || src.empty? # nothing to download for a src-less <img>

    img['src'] = save_image(src)
  end
  File.open("#{@current_dir}/index.html", 'w') { |f| f.write html.to_html }

  #
  # generating TOC in ncx
  #
  # Leading indentation is stripped from every heredoc line so the XML
  # declaration is guaranteed to be the very first bytes of the file.
  File.open("#{@current_dir}/toc.ncx", 'w:utf-8') do |f|
    f.write <<-XML.gsub(/^[ \t]+/, '')
      <?xml version="1.0" encoding="UTF-8"?>
      <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
      <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
      <docTitle><text>#{title}</text></docTitle>
      <navMap>
      <navPoint id="index" playOrder="1">
      <navLabel>
      <text>#{title}</text>
      </navLabel>
      <content src="index.html" />
      </navPoint>
      </navMap>
      </ncx>
    XML
  end

  #
  # generating OPF
  #
  File.open("#{@current_dir}/tdiary.opf", 'w:utf-8') do |f|
    f.write <<-XML.gsub(/^[ \t]+/, '')
      <?xml version="1.0" encoding="utf-8"?>
      <package unique-identifier="uid">
      <metadata>
      <dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
      <dc:Title>#{title}</dc:Title>
      <dc:Language>ja-JP</dc:Language>
      <dc:Creator>#{author}</dc:Creator>
      <dc:Description>tDiary N-Year Diary</dc:Description>
      <dc:Date>#{now.strftime('%d/%m/%Y')}</dc:Date>
      </dc-metadata>
      </metadata>
      <manifest>
      <item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
      <item id="style" media-type="text/css" href="tdiary.css"></item>
      <item id="index" media-type="text/html" href="index.html"></item>
      </manifest>
      <spine toc="toc">
      <itemref idref="index" />
      </spine>
      <tours></tours>
      <guide>
      <reference type="start" title="Start Page" href="index.html"></reference>
      </guide>
      </package>
    XML
  end

  yield "#{@current_dir}/tdiary.opf"
end
98
+
99
+ private
100
+
101
# Runs the given block, re-running it after a failure until it has been
# attempted `times` times in total.
#
# times - maximum number of attempts.
#
# Returns the block's value. Each failed attempt that is followed by another
# one is logged and followed by a one-second pause; the error of the final
# attempt is re-raised unchanged.
def retry_loop(times)
  attempts = 0
  begin
    yield
  rescue => error
    attempts += 1
    raise if attempts >= times

    News2Kindle.logger.debug error
    News2Kindle.logger.info "#{attempts} retry."
    sleep 1
    retry
  end
end
117
+
118
# Downloads one image referenced by the diary page into @current_dir.
#
# img - the image's src attribute; absolute, or relative to @top.
#
# Returns the randomly generated local file name. The name is returned even
# when the download failed: the failure is only logged, so a single broken
# image does not abort the whole conversion.
def save_image(img)
  require 'securerandom'

  # URI.join resolves relative, root-relative and protocol-relative paths
  # against @top and leaves absolute URLs untouched.
  uri = URI.join(@top, img)
  # Take the extension from the path only, so a query string or a dot in the
  # host name cannot leak (possibly with slashes) into the file name.
  file_name = "#{SecureRandom.hex}#{File.extname(uri.path.to_s)}"
  begin
    # Fetch first and write afterwards so a failed download leaves no empty
    # file behind; binary mode keeps the image bytes intact.
    data = uri.open(&:read)
    File.binwrite("#{@current_dir}/#{file_name}", data)
  rescue StandardError => e
    News2Kindle.logger.warn "#{e}: #{uri}"
  end
  file_name
end
133
+ end
134
+ end
135
+ end
@@ -0,0 +1,360 @@
1
+ # -*- coding: utf-8; -*-
2
+ #
3
+ # scraping jp.wsj.com for Kindlizer
4
+ #
5
+
6
+ require 'mechanize'
7
+ require 'nokogiri'
8
+ require 'open-uri'
9
+ require 'tmpdir'
10
+ require 'pathname'
11
+ require 'json'
12
+
13
+ module News2Kindle
14
+ module Generator
15
+ class WsjPaid
16
# Site root; the home page URL is built by appending a path to it.
TOP = 'http://jp.wsj.com'.freeze
# Stand-alone login page that is fetched first to obtain the login form.
LOGIN = 'https://id.wsj.com/access/pages/wsj/jp/login_standalone.html'.freeze
18
+
19
# Loads the WSJ credentials and prepares the working tree below tmpdir.
#
# Credentials are taken from the pit library when it can be loaded and from
# the WSJ_ID / WSJ_PW environment variables otherwise. The "src" and "dst"
# sub-directories are created when missing, and the cover image and
# stylesheet from ./resource are copied into "dst".
def initialize(tmpdir)
  begin
    require 'pit'
    account = Pit.get('wsj', :require => {
      'user' => 'your ID of WSJ.',
      'pass' => 'your Password of WSJ.'
    })
    @wsj_id = account['user']
    @wsj_pw = account['pass']
  rescue LoadError # no pit library, using environment variables
    @wsj_id = ENV['WSJ_ID']
    @wsj_pw = ENV['WSJ_PW']
  end

  @current_dir = tmpdir

  @src_dir = @current_dir + '/src'
  Dir.mkdir(@src_dir) unless File.exist?(@src_dir)

  @dst_dir = @current_dir + '/dst'
  Dir.mkdir(@dst_dir) unless File.exist?(@dst_dir)
  %w[wsj.jpg wsj.css].each do |asset|
    FileUtils.cp("./resource/#{asset}", @dst_dir)
  end
end
43
+
44
# Logs in to WSJ Japan, collects the top news and up to ten articles per
# navigation category, and hands the resulting table of contents to
# generate_contents.
#
# opts - Hash; opts[:now] must respond to #strftime and is used for the
#        generation timestamp shown in the book metadata.
#
# Yields the path of the generated OPF file.
# Re-raises the underlying error when a category page cannot be fetched.
def generate(opts)
  @now = opts[:now]
  @now_str = @now.strftime '%Y-%m-%d %H:%M'
  @title = "WSJ日本版"
  @lang = "ja-JP"

  agent = Mechanize.new
  proxy = ENV['HTTP_PROXY']
  if proxy && !proxy.empty?
    if proxy.include?('://')
      # The usual URL form (http://host:port) cannot be split on ':'.
      proxy_uri = URI.parse(proxy)
      agent.set_proxy(proxy_uri.host, proxy_uri.port, proxy_uri.user, proxy_uri.password)
    else
      # Legacy "host:port[:user:pass]" form, handled as before.
      agent.set_proxy(*proxy.split(':'))
    end
  end

  toc = []

  agent.get(LOGIN)

  form = agent.page.forms.first
  form.action = 'https://id.wsj.com/auth/submitlogin.json'
  form['username'] = @wsj_id
  form['password'] = @wsj_pw
  form.submit

  response = JSON.parse(agent.page.body)
  agent.get(response["url"])

  agent.get(TOP + "/home-page?_wsjregion=asia,jp&_homepage=/home/jp")

  article_link = %r{\Ahttp://jp\.wsj\.com/article/}

  #
  # scraping top news
  #
  toc_top = ['TOP NEWS']
  (agent.page / "div.whatsNews ul.newsItem h2 a").each do |a|
    href = a.attr('href')
    toc_top << [canonical(a.text.strip), href] if href =~ article_link
  end
  toc << toc_top

  #
  # scraping all categories (the first navigation entry is skipped)
  #
  (agent.page.root / 'div.wsjMainNav li').drop(1).each do |li|
    a = (li / 'a').first
    next unless a # navigation entry without a link

    href = a.attr('href')
    toc_cat = [canonical(a.text.strip)]
    begin
      retry_loop(5) do
        agent.get(href)
        sleep 1
      end
    rescue
      # Log the URL that actually failed; the message used to reference an
      # undefined local, which raised NameError and hid the real error.
      News2Kindle.logger.error "cannot get #{href}."
      raise
    end

    (agent.page / "div.leadModule").remove
    news_links = (agent.page / "div.headlineSummary ul.newsItem h2 a").select do |link|
      link.attr('href') =~ article_link
    end
    # at most ten articles per category
    news_links.first(10).each do |link|
      toc_cat << [canonical(link.text.strip), link.attr('href')]
    end
    toc << toc_cat
  end

  generate_contents(toc, agent)
  yield "#{@dst_dir}/wsj-paid.opf"
end
121
+
122
+ private
123
+
124
# Works around the WAVE DASH problem: every FULLWIDTH TILDE (U+FF5E) in str
# is replaced by WAVE DASH (U+301C).
#
# Returns a new String; str itself is left untouched.
def canonical(str)
  fullwidth_tilde = /\uFF5E/
  wave_dash = "\u301C"
  str.gsub(fullwidth_tilde, wave_dash)
end
127
+
128
# Runs the given block, re-running it after a failure until it has been
# attempted `times` times in total.
#
# times - maximum number of attempts.
#
# Returns the block's value. The error of the final attempt is re-raised
# unchanged; earlier failures are logged.
def retry_loop(times)
  count = 0
  begin
    yield
  rescue => e
    count += 1
    raise if count >= times

    News2Kindle.logger.debug e
    News2Kindle.logger.info "#{count} retry."
    # Pause before trying again so a struggling server is not hit with
    # back-to-back requests (matches the tDiary generator's retry loop).
    sleep 1
    retry
  end
end
143
+
144
# Builds the opening part of a generated HTML page, up to and including the
# <h1> heading.
#
# title - page title as plain text; it is HTML-escaped here, so titles
#         containing &, < or > no longer produce broken markup.
#
# Returns the markup as a String ending in a newline.
def html_header(title)
  require 'cgi'

  safe_title = CGI.escapeHTML(title.to_s)
  # Leading indentation is stripped from every line of the template.
  <<-HTML.gsub(/^[ \t]+/, '')
    <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
    <html>
    <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
    <title>#{safe_title}</title>
    <link rel="stylesheet" href="wsj.css" type="text/css" media="all"></link>
    </head>
    <body>
    <h1>#{safe_title}</h1>
  HTML
end
157
+
158
# Returns the parsed HTML of one article, using a file cache in @src_dir.
#
# agent - Mechanize agent used for the download on a cache miss
# uri   - article URL; its article id names the cache file
# sub   - optional suffix appended to the cache file name
#
# On a cache miss the page is fetched with up to five attempts, stored in
# the cache and returned. A failed fetch is logged and re-raised.
def get_html_item(agent, uri, sub = nil)
  cache_path = "#{@src_dir}/#{uri2aid(uri)}#{sub}.html"

  # loading cache
  return Nokogiri(File.open(cache_path, 'r:utf-8', &:read)) if File.exist?(cache_path)

  page_root = nil
  begin
    retry_loop(5) do
      agent.get(uri)
      page_root = agent.page.root
      sleep 1
    end
  rescue
    News2Kindle.logger.error "cannot get #{uri}."
    raise
  end
  File.open(cache_path, 'w:utf-8') { |f| f.write(page_root.to_html) }
  page_root
end
181
+
182
# Extracts the article body from a parsed article page and localises its
# images.
#
# html - parsed document of one article page
#
# The story body (div#article_story_body) is preferred; pages without one
# are treated as slide shows (div#slideContainer). Unwanted widgets are
# removed, every image is downloaded into @dst_dir and its src rewritten to
# the local file name. Images that cannot be downloaded keep their original
# src and only produce a warning.
#
# Returns the inner HTML of the extracted content.
def scrape_html_item(html)
  contents = (html / 'div#article_story_body')

  if contents.empty?
    contents = (html / 'div#slideContainer')
    unless contents.empty?
      (contents / 'div.dSlideViewer').before((contents / 'div.dSlideViewer li.firstSlide').inner_html)
      (contents / 'div.dSlideViewer, h2.header, ul.nav-inline').remove
    end
  else
    signature = (contents / 'ul.socialByline')
    unless signature.empty?
      signature[0].before(signature.inner_text)
      signature.remove
    end
    (contents / 'div.insettipBox , div.insetButton').remove
    (contents / 'div.insetZoomTargetBox a').remove
    (contents / 'div.legacyInset div.embedType-interactive').each { |d| d.parent.remove }
  end

  (contents / 'img').each do |image_tag|
    image_url = image_tag.attr('src')
    next if image_url.nil? || image_url.empty? # <img> without a source

    image_file = File.basename(image_url)
    local_path = "#{@dst_dir}/#{image_file}"
    unless File.exist?(local_path)
      begin
        # URI#open only exists for fetchable schemes, so a scraped src can
        # never reach Kernel#open (which would run "|command" strings and
        # no longer accepts URLs on Ruby 3.0+).
        image = URI.parse(image_url).open(&:read)
        # binary mode keeps the image bytes intact on every platform
        File.binwrite(local_path, image)
      rescue
        News2Kindle.logger.warn "FAIL TO DOWNLOAD IMAGE: #{image_url}"
        next
      end
    end
    image_tag.set_attribute("src", image_file)
  end

  contents.inner_html
end
220
+
221
# Writes the HTML file of one article into @dst_dir and returns the matching
# table-of-contents entry.
#
# item  - headline text used for the TOC entry (and as the page title when
#         the page carries no og:title)
# uri   - article URL
# agent - Mechanize agent passed on to get_html_item
#
# Returns a "<li>" line linking to the written file, or '' when no article
# id can be derived from uri (nothing is written in that case).
def html_item(item, uri, agent)
  require 'cgi'

  aid = uri2aid(uri)
  return '' unless aid

  html = get_html_item(agent, uri)

  File.open("#{@dst_dir}/#{aid}.html", 'w:utf-8') do |f|
    title_tag = (html / 'meta[@property="og:title"]')
    # an og:title meta without a content attribute falls back to the headline
    og_title = title_tag.first && title_tag.first.attr("content")
    title = og_title ? og_title.strip : item
    f.puts canonical(html_header(title))

    f.puts scrape_html_item(html)
    f.puts html_footer
  end

  # The headline is plain text, so escape it before embedding it in markup.
  %Q|\t\t<li><a href="#{aid}.html">#{CGI.escapeHTML(item.to_s)}</a></li>|
end
237
+
238
# Closing markup shared by the generated HTML files.
#
# Returns a String holding the </body> and </html> end tags; the heredoc is
# passed through gsub, which drops one leading tab from each line.
+ def html_footer
239
+ <<-HTML.gsub( /^\t/, '' )
240
+ </body>
241
+ </html>
242
+ HTML
243
+ end
244
+
245
# Opening part of the NCX navigation file.
#
# Returns a String with the XML declaration, the ncx root element, a docTitle
# built from @title and @now_str, the opening <navMap> tag and a first
# navPoint (playOrder 0) that points at toc.html. One leading tab is removed
# from each heredoc line by gsub.
+ def ncx_header
246
+ <<-XML.gsub( /^\t/, '' )
247
+ <?xml version="1.0" encoding="UTF-8"?>
248
+ <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
249
+ <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
250
+ <docTitle><text>#{@title} (#{@now_str})</text></docTitle>
251
+ <navMap>
252
+ <navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
253
+ XML
254
+ end
255
+
256
# Builds the NCX navPoint for one article.
#
# item  - headline text shown as the navigation label (plain text; it is
#         XML-escaped here so "&" or "<" cannot break the NCX file)
# uri   - article URL; its article id becomes the navPoint id and file name
# index - playOrder value of the entry
#
# Returns the navPoint line, or '' when no article id can be derived.
def ncx_item(item, uri, index)
  require 'cgi'

  aid = uri2aid(uri)
  return '' unless aid

  %Q|\t\t<navPoint id="#{aid}" playOrder="#{index}"><navLabel><text>#{CGI.escapeHTML(item.to_s)}</text></navLabel><content src="#{aid}.html" /></navPoint>|
end
260
+
261
# Closing part of the NCX navigation file.
#
# Returns a String with the </navMap> and </ncx> end tags; gsub removes one
# leading tab from each heredoc line.
+ def ncx_footer
262
+ <<-XML.gsub( /^\t/, '' )
263
+ </navMap>
264
+ </ncx>
265
+ XML
266
+ end
267
+
268
# Opening part of the OPF package file.
#
# Returns a String containing the XML declaration, the metadata section
# (title and description built from @title and @now_str, language from
# @lang, date from @now formatted as dd/mm/YYYY, wsj.jpg as embedded cover)
# and the start of the manifest with the fixed entries toc.ncx, wsj.css and
# toc.html. The <manifest> element is left open. gsub removes one leading
# tab from each heredoc line.
+ def opf_header
269
+ <<-XML.gsub( /^\t/, '' )
270
+ <?xml version="1.0" encoding="utf-8"?>
271
+ <package unique-identifier="uid">
272
+ <metadata>
273
+ <dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
274
+ <dc:Title>#{@title} (#{@now_str})</dc:Title>
275
+ <dc:Language>#{@lang}</dc:Language>
276
+ <dc:Creator>The Wall Street Journal Online</dc:Creator>
277
+ <dc:Description>#{@title}、#{@now_str}生成</dc:Description>
278
+ <dc:Date>#{@now.strftime( '%d/%m/%Y' )}</dc:Date>
279
+ </dc-metadata>
280
+ <x-metadata>
281
+ <output encoding="utf-8" content-type="text/x-oeb1-document"></output>
282
+ <EmbeddedCover>wsj.jpg</EmbeddedCover>
283
+ </x-metadata>
284
+ </metadata>
285
+ <manifest>
286
+ <item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
287
+ <item id="style" media-type="text/css" href="wsj.css"></item>
288
+ <item id="index" media-type="text/html" href="toc.html"></item>
289
+ XML
290
+ end
291
+
292
# Builds the OPF manifest entry for one article.
#
# uri - article URL; its article id becomes the item id and file name.
#
# Returns the <item> line, or '' when no article id can be derived.
def opf_item(uri)
  aid = uri2aid(uri)
  return '' unless aid

  %Q|\t\t<item id="#{aid}" media-type="text/html" href="#{aid}.html"></item>|
end
296
+
297
# Closing part of the OPF package file: ends the manifest and emits the
# spine and guide sections.
#
# aids - article ids in reading order; each becomes a spine itemref, and the
#        first one is the guide's start page.
#
# With an empty list the start reference points at toc.html instead of the
# invalid ".html" that an absent first article would produce.
#
# Returns the markup as a String.
def opf_footer(aids)
  start_page = aids.empty? ? 'toc.html' : "#{aids[0]}.html"

  # Leading indentation is stripped from every template line.
  footer = <<-XML.gsub(/^[ \t]+/, '')
    </manifest>
    <spine toc="toc">
  XML
  aids.each do |aid|
    footer << %Q|\t<itemref idref="#{aid}" />\n|
  end
  footer << <<-XML.gsub(/^[ \t]+/, '')
    <itemref idref="index" />
    </spine>
    <tours></tours>
    <guide>
    <reference type="toc" title="Table of Contents" href="toc.html"></reference>
    <reference type="start" title="Top Story" href="#{start_page}"></reference>
    </guide>
    </package>
  XML
  footer
end
317
+
318
# Extracts the article id from an article URL.
#
# uri - URL String of the form ".../article/<id>.html"
#
# Returns the id String, or nil when the URL has no such segment. The dot
# before "html" is matched literally; left unescaped it matched any
# character, so ".../article/xhtml" yielded an empty id.
def uri2aid(uri)
  uri[%r{/article/([^/]*)\.html}, 1]
end
321
+
322
# Writes toc.html, toc.ncx and wsj-paid.opf into @dst_dir and, through
# html_item, one HTML file per article.
#
# toc   - Array of categories; each category is an Array whose String
#         elements are headings and whose other elements are
#         [headline, url] pairs
# agent - Mechanize agent passed on to html_item
#
# Articles whose URL yields no article id are skipped entirely, so they
# neither leave blank lines in the output nor use up a playOrder number.
# An article listed more than once gets a TOC/NCX entry each time but only
# one manifest and spine entry.
def generate_contents(toc, agent)
  File.open("#{@dst_dir}/toc.html", 'w:utf-8') do |html|
    File.open("#{@dst_dir}/toc.ncx", 'w:utf-8') do |ncx|
      File.open("#{@dst_dir}/wsj-paid.opf", 'w:utf-8') do |opf|
        first = true
        toc_index = 0
        aids = []
        ncx.puts ncx_header
        opf.puts opf_header
        toc.each do |category|
          category.each do |article|
            if article.is_a?(String)
              # A heading: the first one opens the page, later ones close
              # the previous list and start a new page.
              html.puts first ? html_header('Table of Contents') : "\t</ul>\n\t<mbp:pagebreak />"
              html.puts "\t<h2>#{article}</h2>"
              html.puts "\t<ul>"
              first = false
            else
              headline, uri = article
              aid = uri2aid(uri)
              next unless aid

              html.puts html_item(headline, uri, agent)
              ncx.puts ncx_item(headline, uri, toc_index += 1)
              unless aids.include?(aid)
                opf.puts opf_item(uri)
                aids << aid
              end
            end
          end
        end
        html.puts "\t</ul>"
        html.puts html_footer
        ncx.puts ncx_footer
        opf.puts opf_footer(aids)
      end
    end
  end
end
358
+ end
359
+ end
360
+ end