news2kindle 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.rspec +3 -0
- data/.tachikoma.yml +1 -0
- data/.travis.yml +18 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +119 -0
- data/README.md +59 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/bin/test-generator +21 -0
- data/exe/news2kindle +107 -0
- data/lib/news2kindle.rb +12 -0
- data/lib/news2kindle/dup_checker.rb +41 -0
- data/lib/news2kindle/generator/internet-watch.rb +236 -0
- data/lib/news2kindle/generator/nikkei-free.rb +18 -0
- data/lib/news2kindle/generator/nikkei-paid.rb +352 -0
- data/lib/news2kindle/generator/tdiary.rb +135 -0
- data/lib/news2kindle/generator/wsj-paid.rb +360 -0
- data/lib/news2kindle/generator/wsjus-paid.rb +90 -0
- data/lib/news2kindle/task.rb +116 -0
- data/lib/news2kindle/version.rb +3 -0
- data/news2kindle.gemspec +37 -0
- data/news2kindle.yaml.sample +31 -0
- data/resource/internet-watch.css +27 -0
- data/resource/internet-watch.jpg +0 -0
- data/resource/nikkei.css +43 -0
- data/resource/nikkei.jpg +0 -0
- data/resource/tdiary.css +27 -0
- data/resource/wsj-us.jpg +0 -0
- data/resource/wsj.css +19 -0
- data/resource/wsj.jpg +0 -0
- metadata +245 -0
@@ -0,0 +1,236 @@
|
|
1
|
+
# scraping internet.watch.impress.co.jp for News2Kindle
|
2
|
+
#
|
3
|
+
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'open-uri'
|
6
|
+
require 'uri'
|
7
|
+
require 'ostruct'
|
8
|
+
require 'tmpdir'
|
9
|
+
require 'pathname'
|
10
|
+
require 'fileutils'
|
11
|
+
|
12
|
+
module News2Kindle
|
13
|
+
module Generator
|
14
|
+
class InternetWatch
|
15
|
+
TOP = 'https://internet.watch.impress.co.jp'
|
16
|
+
|
17
|
+
# Prepares the working tree below +tmpdir+.
#
# Creates the "src" and "dst" sub-directories (Dir.mkdir raises if one of
# them already exists) and copies the cover image and the stylesheet from
# ./resource, resolved against the current working directory, into "dst".
#
# tmpdir:: path of an existing directory, as a String.
def initialize( tmpdir )
  @current_dir = tmpdir
  @src_dir, @dst_dir = %w(/src /dst).map do |suffix|
    (@current_dir + suffix).tap {|dir| Dir.mkdir( dir ) }
  end
  %w(jpg css).each do |ext|
    FileUtils.cp( "./resource/internet-watch.#{ext}", @dst_dir )
  end
end
|
28
|
+
|
29
|
+
# Builds the Kindle source files for INTERNET Watch.
#
# Reads the RDF feed, keeps the items hosted on internet.watch.impress.co.jp
# that News2Kindle::DupChecker does not report as duplicates, and writes
# into @dst_dir: one HTML file per article (images served by the site itself
# are downloaded next to it), toc.html, toc.ncx and internet-watch.opf.
#
# opts:: Hash; opts[:now] must respond to strftime and supplies the date
#        used in the document title and metadata.
#
# Yields the path of the generated OPF file.
#
# An error while fetching the feed is re-raised once retry_loop gives up.
# An article that cannot be generated is logged, its partial file removed
# and it is left out of the TOC, NCX and OPF. A failed image download
# (OpenURI::HTTPError) is logged and the image left untouched.
def generate(opts)
  now = opts[:now]
  now_str = now.strftime( '%Y-%m-%d %H:%M' )
  # Titles are plain text taken from the parsed feed, so "&", "<" and ">"
  # have to be escaped again before being embedded in XML/HTML.
  escape = lambda {|text| text.to_s.encode( xml: :text ) }

  rdf_file = "http://rss.rssad.jp/rss/internetwatch/internet.rdf"
  rdf = retry_loop( 5 ) do
    # URI.open: open-uri no longer handles URLs through Kernel#open on Ruby 3.0+.
    Nokogiri( URI.open( rdf_file, 'r:utf-8', &:read ) )
  end

  items = []
  (rdf / 'item').each do |item|
    uri = URI( item.attr( 'rdf:about' ).to_s )
    next unless /internet\.watch\.impress\.co\.jp/ =~ uri.host
    uri.query = nil # remove query of 'ref=rss'
    next if News2Kindle::DupChecker.dup?( uri )

    title = (item / 'title').text
    date = item.elements.map {|e| e.text if e.name == 'date' }.join
    items << OpenStruct.new( uri: uri, title: title, date: date )
  end
  items.sort! {|a, b| a.date <=> b.date }

  #
  # generating articles in html (only successfully written ones are kept)
  #
  items = items.select do |item|
    out_file = "#{@dst_dir}/#{item_id item.uri}.html"
    begin
      article = get_article( item.uri )
      File.open( out_file, 'w' ) do |f|
        f.puts html_header( item.title )
        contents = (article / 'div.mainContents')
        (contents / 'img').each do |img|
          org = img.attr( 'ajax' ) || img.attr( 'src' )
          next if org.nil?        # <img> without any source
          next if org =~ /^http/  # skip images on other servers
          begin
            img_file = retry_loop( 5 ) do
              URI.open( "#{TOP}#{org}", &:read )
            end
            cache = org.gsub( /\//, '_' ).sub( /^_/, '' )
            # binary mode: image bytes must not go through newline conversion
            File.open( "#{@dst_dir}/#{cache}", 'wb' ) {|out| out.write img_file }
            img.set_attribute( 'src', cache )
          rescue OpenURI::HTTPError
            News2Kindle.logger.error "skipped an image: #{TOP}#{org}"
          end
        end
        f.puts contents.inner_html
        f.puts html_footer
      end
      true
    rescue
      News2Kindle.logger.warn "#{$!.class}: #$!"
      News2Kindle.logger.warn "skipped an article: #{item.uri}"
      FileUtils.rm_f( out_file ) # do not leave a half-written article behind
      false
    end
  end

  #
  # generating TOC in html
  #
  File.open( "#{@dst_dir}/toc.html", 'w:utf-8' ) do |f|
    f.write html_header( 'Table of Contents' )
    if items.empty?
      f.puts %Q|<p>本日は記事がありません。</p>|
    else
      f.puts "<ul>"
      items.each do |item|
        f.puts %Q|\t<li><a href="#{item_id item.uri}.html">#{escape.( item.title )}</a></li>|
      end
      f.puts "</ul>"
    end
    f.write html_footer
  end

  #
  # generating TOC in ncx
  #
  File.open( "#{@dst_dir}/toc.ncx", 'w:utf-8' ) do |f|
    f.write <<~XML
      <?xml version="1.0" encoding="UTF-8"?>
      <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
      <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
      <docTitle><text>INTERNET Watch (#{now_str})</text></docTitle>
      <navMap>
      <navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
    XML

    # playOrder 0 belongs to the TOC entry above, so articles start at 1.
    items.each.with_index( 1 ) do |item, order|
      f.puts %Q|\t\t<navPoint id="#{item_id item.uri}" playOrder="#{order}"><navLabel><text>#{escape.( item.title )}</text></navLabel><content src="#{item_id item.uri}.html" /></navPoint>|
    end

    f.write <<~XML
      </navMap>
      </ncx>
    XML
  end

  #
  # generating OPF
  #
  File.open( "#{@dst_dir}/internet-watch.opf", 'w:utf-8' ) do |f|
    f.write <<~XML
      <?xml version="1.0" encoding="utf-8"?>
      <package unique-identifier="uid">
      <metadata>
      <dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
      <dc:Title>INTERNET Watch (#{now_str})</dc:Title>
      <dc:Language>ja-JP</dc:Language>
      <dc:Creator>インプレス</dc:Creator>
      <dc:Description>INTERNET Watch、#{now_str}生成</dc:Description>
      <dc:Date>#{now.strftime( '%d/%m/%Y' )}</dc:Date>
      </dc-metadata>
      <x-metadata>
      <output encoding="utf-8" content-type="text/x-oeb1-document"></output>
      <EmbeddedCover>internet-watch.jpg</EmbeddedCover>
      </x-metadata>
      </metadata>
      <manifest>
      <item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
      <item id="style" media-type="text/css" href="internet-watch.css"></item>
      <item id="index" media-type="text/html" href="toc.html"></item>
    XML

    items.each do |item|
      f.puts %Q|\t\t<item id="#{item_id item.uri}" media-type="text/html" href="#{item_id item.uri}.html"></item>|
    end

    f.write <<~XML
      </manifest>
      <spine toc="toc">
      <itemref idref="index" />
    XML

    items.each do |item|
      f.puts %Q|\t<itemref idref="#{item_id item.uri}" />\n|
    end

    f.write <<~XML
      </spine>
      <tours></tours>
      <guide>
      <reference type="toc" title="Table of Contents" href="toc.html"></reference>
      <reference type="start" title="Table of Contents" href="toc.html"></reference>
      </guide>
      </package>
    XML
  end

  yield "#{@dst_dir}/internet-watch.opf"
end
|
176
|
+
|
177
|
+
private
|
178
|
+
|
179
|
+
# Runs the given block and returns its value, trying again when it raises
# a StandardError.
#
# times:: total number of attempts; the error of the last attempt is
#         re-raised unchanged.
#
# Every failed attempt except the last one is logged through
# News2Kindle.logger and followed by a one second pause.
def retry_loop( times )
  failures = 0
  begin
    yield
  rescue => error
    failures += 1
    raise if failures >= times
    News2Kindle.logger.error error
    News2Kindle.logger.info "#{failures} retry."
    sleep 1
    retry
  end
end
|
195
|
+
|
196
|
+
# Returns the last segment of uri's path with a trailing ".html" removed.
# The result is used as file name stem and as id of the article.
#
# uri:: object responding to #path (generate passes URI instances).
def item_id( uri )
  article_path = uri.path
  File.basename( article_path, '.html' )
end
|
199
|
+
|
200
|
+
# Returns the page at +uri+ parsed by Nokogiri.
#
# The raw page is cached in @src_dir under the last segment of the URI
# path; when that file exists it is read instead of hitting the network.
# Otherwise the page is downloaded (up to 5 attempts through retry_loop)
# and stored in the cache.
#
# uri:: URI of the article (generate passes URI instances).
#
# Re-raises the download error once retry_loop gives up.
def get_article( uri )
  cache = "#{@src_dir}/#{File.basename uri.path}"
  html =
    if File.exist?( cache )
      File.read( cache )
    else
      # URI.open: Kernel#open no longer accepts URLs/URI objects on Ruby 3.0+.
      fetched = retry_loop( 5 ) { URI.open( uri, &:read ) }
      File.open( cache, 'w' ) {|f| f.write fetched }
      fetched
    end
  # Bytes that cannot be represented in UTF-8 are replaced by "?".
  Nokogiri( html.encode( 'UTF-8', invalid: :replace, undef: :replace, replace: '?' ) )
end
|
213
|
+
|
214
|
+
# Returns the opening part of an article or TOC page: doctype, <head> with
# the UTF-8 content type and the internet-watch.css stylesheet, the opening
# <body> tag and an <h1> heading.
#
# title:: text placed, unescaped, in both <title> and <h1>.
def html_header( title )
  lines = [
    '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">',
    '<html>',
    '<head>',
    '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>',
    "<title>#{title}</title>",
    '<link rel="stylesheet" href="internet-watch.css" type="text/css" media="all"></link>',
    '</head>',
    '<body>',
    "<h1>#{title}</h1>",
  ]
  # One leading tab is stripped from every line of the assembled text,
  # including lines that come from a multi-line title.
  lines.map {|line| line + "\n" }.join.gsub( /^\t/, '' )
end
|
227
|
+
|
228
|
+
# Returns the closing tags matching html_header, one per line.
def html_footer
  %w(</body> </html>).map {|tag| "#{tag}\n" }.join
end
|
234
|
+
end
|
235
|
+
end
|
236
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# scraping nikkei.com (for free user) for News2Kindle
|
2
|
+
#
|
3
|
+
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'open-uri'
|
6
|
+
require 'tmpdir'
|
7
|
+
require 'pathname'
|
8
|
+
require (File.dirname(__FILE__) + '/nikkei-paid')
|
9
|
+
|
10
|
+
module News2Kindle
|
11
|
+
module Generator
|
12
|
+
# Generator for nikkei.com without an account: identical to NikkeiPaid
# except that no credentials are supplied.
class NikkeiFree < NikkeiPaid
  # Returns a pair of nils in place of the [id, password] pair that
  # NikkeiPaid#auth returns.
  def auth
    [nil, nil]
  end
end
|
17
|
+
end
|
18
|
+
end
|
@@ -0,0 +1,352 @@
|
|
1
|
+
# scraping nikkei.com (for paid user) for News2Kindle
|
2
|
+
#
|
3
|
+
|
4
|
+
require 'mechanize'
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'open-uri'
|
7
|
+
require 'tmpdir'
|
8
|
+
require 'pathname'
|
9
|
+
|
10
|
+
module News2Kindle
|
11
|
+
module Generator
|
12
|
+
class NikkeiPaid
|
13
|
+
# Raised by html_item when an article page cannot be parsed;
# generate_contents rescues it and skips that article.
class IllegalPage < StandardError
end

# Site root; relative article and image paths are appended to it.
TOP = 'https://www.nikkei.com'
# Login entry page fetched by generate before the credentials are submitted.
LOGIN = TOP + '/etc/accounts/login?dps=3&pageflag=top&url=http%3A%2F%2Fwww.nikkei.com%2F'
|
17
|
+
|
18
|
+
# Fetches the credentials through #auth and prepares the working tree
# below +tmpdir+: creates the "src" and "dst" sub-directories (Dir.mkdir
# raises if one already exists) and copies nikkei.jpg and nikkei.css from
# ./resource, resolved against the current working directory, into "dst".
#
# tmpdir:: path of an existing directory, as a String.
def initialize( tmpdir )
  @nikkei_id, @nikkei_pw = auth
  @current_dir = tmpdir

  @src_dir = @current_dir + '/src'
  Dir.mkdir( @src_dir )
  @dst_dir = @current_dir + '/dst'
  Dir.mkdir( @dst_dir )

  ['nikkei.jpg', 'nikkei.css'].each do |name|
    FileUtils.cp( "./resource/#{name}", @dst_dir )
  end
end
|
30
|
+
|
31
|
+
# Builds the Kindle source files for nikkei.com.
#
# Logs in when both @nikkei_id and @nikkei_pw are set (otherwise just loads
# the top page), collects the headlines of the top page into a table of
# contents and hands it to generate_contents.
#
# opts:: Hash; opts[:now] must respond to strftime and supplies the date
#        used in the document title and metadata.
#
# Yields the path of the generated OPF file.
#
# When logged in, the logout page is requested afterwards even if
# generating the contents or the caller's block raised.
def generate(opts)
  @now = opts[:now]
  @now_str = @now.strftime '%Y-%m-%d %H:%M'

  agent = Mechanize.new
  proxy = ENV['HTTP_PROXY']
  if proxy && !proxy.empty?
    if proxy =~ %r{\A[a-z][a-z0-9+.-]*://}i
      # URL form (http://user:pass@host:port): a plain split on ":" would
      # pass the scheme as address and "//host" as port.
      proxy_uri = URI.parse( proxy )
      agent.set_proxy( proxy_uri.host, proxy_uri.port, proxy_uri.user, proxy_uri.password )
    else
      agent.set_proxy( *proxy.split( /:/ ) ) # bare "host:port" form
    end
  end

  logout_url = 'https://regist.nikkei.com/ds/etc/accounts/logout'
  logged_in = @nikkei_id && @nikkei_pw

  toc = []
  if logged_in
    agent.get( logout_url )
    agent.get( LOGIN )
    agent.page.form_with( :name => 'autoPostForm' ).submit
    agent.page.form_with( :name => 'LA7010Form01' ) do |form|
      form['LA7010Form01:LA7010Email'] = @nikkei_id
      form['LA7010Form01:LA7010Password'] = @nikkei_pw
      form.click_button
    end
    agent.page.forms.first.submit
  else
    agent.get( TOP )
  end

  #
  # scraping top news
  #
  toc_top = ['TOP NEWS']
  %w(first second third fourth).each do |category|
    (agent.page / "div.nx-top_news_#{category} h3 a").each do |a|
      uri = a.attr( 'href' )
      next if News2Kindle::DupChecker.dup?( uri )
      toc_top << [canonical( a.text.strip ), uri]
    end
  end
  toc << toc_top

  #
  # scraping all categories
  #
  (agent.page / 'div.cmnc-genre').each do |genre|
    toc_cat = []
    (genre / 'h4.cmnc-genre_title a.cmnc-title_text').each do |cat|
      next if /local/ =~ cat.attr( 'href' )
      toc_cat << cat.text
      (genre / 'li a').each do |article|
        uri = article.attr( 'href' )
        next if News2Kindle::DupChecker.dup?( uri )
        toc_cat << [canonical( article.text ), uri]
      end
    end
    toc << toc_cat
  end

  begin
    generate_contents( toc, agent )
    yield "#{@dst_dir}/#{basename}.opf"
  ensure
    # Close the session on the error path too.
    agent.get( logout_url ) if logged_in
  end
end
|
92
|
+
|
93
|
+
private
|
94
|
+
|
95
|
+
# Looks up the Nikkei credentials stored by the pit gem under the
# 'news2kindle' entry and returns them as [user, password].
# pit is loaded lazily, only when this method is called.
def auth
  require 'pit'
  prompts = {
    nikkei_user: 'your ID of Nikkei.',
    nikkei_pass: 'your Password of Nikkei.',
  }
  credentials = Pit::get( 'news2kindle', require: prompts )
  [credentials[:nikkei_user], credentials[:nikkei_pass]]
end
|
103
|
+
|
104
|
+
# Derives a file name stem from the receiver's class name: the namespace
# is dropped and CamelCase becomes lower-case words joined by "-"
# (NikkeiPaid => "nikkei-paid").
def basename
  short_name = self.class.to_s.sub( /.*:/, '' )
  hyphenated = short_name.gsub( /([A-Z])/ ) { "-#{$1}" }
  hyphenated.sub( /^-/, '' ).downcase
end
|
107
|
+
|
108
|
+
# Returns a copy of +str+ in which every FULLWIDTH TILDE (U+FF5E) is
# replaced by WAVE DASH (U+301C) -- the "WAVE DASH problem".
def canonical( str )
  fullwidth_tilde = "\uFF5E"
  wave_dash = "\u301C"
  str.gsub( fullwidth_tilde, wave_dash )
end
|
111
|
+
|
112
|
+
# Runs the given block and returns its value, trying again immediately
# when it raises a StandardError.
#
# times:: total number of attempts; the error of the last attempt is
#         re-raised unchanged.
#
# Every failed attempt except the last one is logged through
# News2Kindle.logger.
def retry_loop( times )
  failures = 0
  begin
    yield
  rescue => error
    failures += 1
    raise unless failures < times
    News2Kindle.logger.error error
    News2Kindle.logger.info "#{failures} retry."
    retry
  end
end
|
127
|
+
|
128
|
+
# Returns the opening part of an article or TOC page: doctype, <head> with
# the UTF-8 content type and the nikkei.css stylesheet, the opening <body>
# tag and an <h1> heading.
#
# title:: text placed, unescaped, in both <title> and <h1>.
def html_header( title )
  [
    '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">',
    '<html>',
    '<head>',
    '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>',
    "<title>#{title}</title>",
    '<link rel="stylesheet" href="nikkei.css" type="text/css" media="all"></link>',
    '</head>',
    '<body>',
    "<h1>#{title}</h1>",
  ].map {|line| line + "\n" }.join
end
|
141
|
+
|
142
|
+
# Returns the parsed page of an article, using a cache file in @src_dir.
#
# agent:: browser object; must respond to get(url) and page (the fetch
#         path reads agent.page.root).
# uri::   article URL or site-relative path; a leading
#         "https://www.nikkei.com" is ignored. The argument itself is
#         never modified.
# sub::   optional suffix appended to the article id in the cache file
#         name (html_item passes the page number of continuation pages).
#
# When "<aid><sub>.html" exists in @src_dir it is parsed with Nokogiri and
# returned. Otherwise the page is fetched (up to 5 attempts through
# retry_loop, followed by a one second pause), written to the cache and
# its root node returned. A failed download is logged and re-raised.
def get_html_item( agent, uri, sub = nil )
  # Non-destructive sub: the caller's string must stay intact (and may be frozen).
  path = uri.sub( %r|\Ahttps://www\.nikkei\.com|, '' )
  cache = "#{@src_dir}/#{uri2aid( path )}#{sub}.html"
  return Nokogiri( File.read( cache, encoding: 'utf-8' ) ) if File.exist?( cache ) # loading cache

  html = nil
  begin
    retry_loop( 5 ) do
      agent.get( "#{TOP}#{path}" )
      html = agent.page.root
      sleep 1
    end
  rescue
    News2Kindle.logger.error "cannot get #{TOP}#{path}."
    raise
  end
  File.open( cache, 'w:utf-8' ) {|f| f.write( html.to_html ) }
  html
end
|
166
|
+
|
167
|
+
# Extracts the body of an article page as an HTML fragment.
#
# html:: parsed page (Nokogiri node).
#
# Walks the direct children of every div.cmn-article_text:
# * <p>     -> "<p>text</p>"; paragraphs holding an a.cmnc-continue link
#              are skipped and span.JSID_urlData elements removed first
# * <table> -> copied as is
# * <div>   -> every site-relative image in it is downloaded into @dst_dir
#              and emitted together with the div's text as caption
#
# Returns the fragment as a String. An image that cannot be downloaded is
# logged and left out.
def scrape_html_item( html )
  result = ''
  (html / 'div.cmn-article_text').each do |div|
    div.children.each do |e|
      case e.name
      when 'p'
        next unless (e / 'a.cmnc-continue').empty?
        (e / 'span.JSID_urlData').remove
        # NOTE(review): String#strip already removes a leading ASCII space, so
        # this pattern was presumably an ideographic space (U+3000) used as
        # paragraph indent; both are accepted here -- confirm against the original.
        para = canonical e.text.strip.sub( /^[ \u3000]/, '' )
        result << "\t<p>#{para}</p>" unless para.empty?
      when 'table'
        result << e.to_html
      when 'div'
        e.css( 'img' ).each do |img|
          image_url = img['src']
          next if image_url.nil? || image_url.empty? # <img> without a source
          next if /^http/ =~ image_url # skip images in other server
          next if /^\/\// =~ image_url # skip assets
          image_file = File.basename( image_url )
          begin
            # URI.open: Kernel#open no longer fetches URLs on Ruby 3.0+.
            image = URI.open( "#{TOP}#{image_url.sub( /PN/, 'PB' )}", &:read )
            # binary mode: image bytes must not go through newline conversion
            File.open( "#{@dst_dir}/#{image_file}", 'wb' ) {|fp| fp.write image }
            result << %Q|\t<div>|
            result << %Q|\t\t<img src="#{image_file}">|
            result << %Q|\t\t<p>[#{e.text}]</p>| unless e.text.strip.empty?
            result << %Q|\t</div>|
          rescue
            # best effort: a missing image must not drop the article
            News2Kindle.logger.debug $!
            News2Kindle.logger.warn "FAIL TO DOWNLOAD IMAGE: #{image_url}"
          end
        end
      end
    end
  end
  result
end
|
203
|
+
|
204
|
+
# Writes the HTML file of one article and returns its TOC entry.
#
# item::  headline text shown in the table of contents.
# uri::   article URL; when uri2aid finds no article id in it, nothing is
#         written and '' is returned.
# agent:: browser object handed on to get_html_item.
#
# The article file "<aid>.html" in @dst_dir holds the header, the scraped
# first page and the scraped pages linked from div.cmn-article_nation.
#
# Returns the "<li>" line for toc.html.
# Raises IllegalPage (after removing the output file) when parsing hits a
# NoMethodError, e.g. because the page has no article title element.
def html_item( item, uri, agent )
  aid = uri2aid( uri )
  return '' unless aid
  html = get_html_item( agent, uri )
  out_file = "#{@dst_dir}/#{aid}.html"

  begin
    File.open( out_file, 'w:utf-8' ) do |f|
      title = (html / 'h1.cmn-article_title, h4.cmn-article_title, h2.cmn-article_title')[0].text.strip
      f.puts canonical( html_header( title ) )
      f.puts scrape_html_item( html )

      # Anchors without href yield nil, which would make sort raise.
      pages = (html / 'div.cmn-article_nation ul li a').map {|link| link.attr( 'href' ) }.compact
      pages.sort.uniq.each_with_index do |link, index|
        f.puts scrape_html_item( get_html_item( agent, link, index + 2 ) )
      end
      f.puts html_footer
    end

    # The headline is plain text, so markup characters are escaped here.
    %Q|\t\t<li><a href="#{aid}.html">#{item.to_s.encode( xml: :text )}</a></li>|
  rescue NoMethodError
    News2Kindle.logger.debug $!
    News2Kindle.logger.error "page parsing failed. #{aid}"
    File.delete( out_file ) if File.exist?( out_file )
    raise IllegalPage
  end
end
|
230
|
+
|
231
|
+
# Returns the closing tags matching html_header, one per line.
def html_footer
  ['</body>', '</html>'].map {|tag| tag + "\n" }.join
end
|
237
|
+
|
238
|
+
# Returns the head of toc.ncx: XML declaration, doctype, the document
# title containing @now_str, the opening <navMap> and the navPoint of the
# table of contents (playOrder 0).
def ncx_header
  [
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">',
    '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">',
    "<docTitle><text>日経電子版 (#{@now_str})</text></docTitle>",
    '<navMap>',
    '<navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>',
  ].map {|line| line + "\n" }.join
end
|
248
|
+
|
249
|
+
# Returns the <navPoint> line of one article for toc.ncx.
#
# item::  headline used as label; "&", "<" and ">" are escaped because the
#         NCX file is XML and a raw "&" would make it malformed.
# uri::   article URL from which the article id is taken via uri2aid.
# index:: value of the playOrder attribute.
#
# Returns '' when the URL carries no article id.
def ncx_item( item, uri, index )
  aid = uri2aid( uri )
  return '' unless aid
  label = item.to_s.encode( xml: :text )
  %Q|\t\t<navPoint id="#{aid}" playOrder="#{index}"><navLabel><text>#{label}</text></navLabel><content src="#{aid}.html" /></navPoint>|
end
|
253
|
+
|
254
|
+
# Returns the closing tags of toc.ncx, one per line.
def ncx_footer
  ['</navMap>', '</ncx>'].map {|tag| tag + "\n" }.join
end
|
260
|
+
|
261
|
+
# Returns the head of the OPF package file: the metadata (title and
# description built from @now_str, date taken from @now, cover image
# nikkei.jpg) and the start of the manifest with the entries for toc.ncx,
# nikkei.css and toc.html.
def opf_header
  date = @now.strftime( '%d/%m/%Y' )
  [
    '<?xml version="1.0" encoding="utf-8"?>',
    '<package unique-identifier="uid">',
    '<metadata>',
    '<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">',
    "<dc:Title>日経電子版 (#{@now_str})</dc:Title>",
    '<dc:Language>ja-JP</dc:Language>',
    '<dc:Creator>日本経済新聞社</dc:Creator>',
    "<dc:Description>日経電子版、#{@now_str}生成</dc:Description>",
    "<dc:Date>#{date}</dc:Date>",
    '</dc-metadata>',
    '<x-metadata>',
    '<output encoding="utf-8" content-type="text/x-oeb1-document"></output>',
    '<EmbeddedCover>nikkei.jpg</EmbeddedCover>',
    '</x-metadata>',
    '</metadata>',
    '<manifest>',
    '<item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>',
    '<item id="style" media-type="text/css" href="nikkei.css"></item>',
    '<item id="index" media-type="text/html" href="toc.html"></item>',
  ].map {|line| line + "\n" }.join
end
|
284
|
+
|
285
|
+
# Returns the manifest <item> line of one article for the OPF file, or ''
# when uri2aid finds no article id in +uri+.
def opf_item( uri )
  aid = uri2aid( uri )
  return '' unless aid
  %Q|\t\t<item id="#{aid}" media-type="text/html" href="#{aid}.html"></item>|
end
|
289
|
+
|
290
|
+
# Returns the tail of the OPF package file: end of the manifest, the spine
# (all articles in the given order, followed by the table of contents) and
# the guide.
#
# aids:: Array of article ids in reading order; may be empty.
#
# The guide's "start" reference points at the first article. When there
# is no article it points at toc.html, so the guide never references the
# non-existent file ".html".
def opf_footer( aids )
  start_reference =
    if aids.empty?
      %Q|<reference type="start" title="Table of Contents" href="toc.html"></reference>|
    else
      %Q|<reference type="start" title="Top Story" href="#{aids[0]}.html"></reference>|
    end

  lines = ['</manifest>', '<spine toc="toc">']
  lines.concat( aids.map {|aid| %Q|\t<itemref idref="#{aid}" />| } )
  lines.push(
    '<itemref idref="index" />',
    '</spine>',
    '<tours></tours>',
    '<guide>',
    '<reference type="toc" title="Table of Contents" href="toc.html"></reference>',
    start_reference,
    '</guide>',
    '</package>'
  )
  lines.join( "\n" ) + "\n"
end
|
306
|
+
|
307
|
+
# Returns the article id of +uri+: the path segment that directly follows
# the first "/article/" and is closed by another "/". Returns nil when the
# URL has no such segment.
def uri2aid( uri )
  uri[%r|/article/([^/]*)/|, 1]
end
|
310
|
+
|
311
|
+
# Writes toc.html, toc.ncx and the OPF file into @dst_dir and, through
# html_item, one HTML file per article.
#
# toc::   Array of categories; each category is an Array whose String
#         elements are section headings and whose other elements are
#         [headline, uri] pairs.
# agent:: browser object handed on to html_item.
#
# An article whose page raises IllegalPage is silently left out of all
# three files.
def generate_contents( toc, agent )
  open( "#{@dst_dir}/toc.html", 'w:utf-8' ) do |html|
    open( "#{@dst_dir}/toc.ncx", 'w:utf-8' ) do |ncx|
      open( "#{@dst_dir}/#{basename}.opf", 'w:utf-8' ) do |opf|
        heading_seen = false
        play_order = 0
        listed_aids = []
        ncx.puts ncx_header
        opf.puts opf_header

        toc.each do |category|
          category.each do |entry|
            if entry.class == String
              # The first heading opens the page; later ones close the
              # previous list and start a new page.
              if heading_seen
                html.puts "\t</ul>\n\t<mbp:pagebreak />"
              else
                html.puts html_header( 'Table of Contents' )
              end
              html.puts "\t<h2>#{entry}</h2>"
              html.puts "\t<ul>"
              heading_seen = true
              next
            end

            headline, uri = entry
            begin
              html.puts html_item( headline, uri, agent )
              play_order += 1
              ncx.puts ncx_item( headline, uri, play_order )
              aid = uri2aid( uri )
              # An article listed in several categories gets one manifest entry.
              unless listed_aids.include?( aid )
                opf.puts opf_item( uri )
                listed_aids << aid if aid
              end
            rescue IllegalPage
            end
          end
        end

        html.puts "\t</ul>"
        html.puts html_footer
        ncx.puts ncx_footer
        opf.puts opf_footer( listed_aids )
      end
    end
  end
end
|
350
|
+
end
|
351
|
+
end
|
352
|
+
end
|