nhkore 0.3.2 → 0.3.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -44,7 +44,7 @@ module NHKore
44
44
  end
45
45
 
46
46
  def self.parse_url(url,basename: nil)
47
- url = Util.strip_web_str(url)
47
+ url = Util.strip_web_str(url.to_s())
48
48
 
49
49
  raise ParseError,"cannot parse dictionary URL from URL[#{url}]" if url.empty?()
50
50
 
@@ -24,6 +24,7 @@
24
24
  require 'nhkore/article'
25
25
  require 'nhkore/article_scraper'
26
26
  require 'nhkore/cleaner'
27
+ require 'nhkore/datetime_parser'
27
28
  require 'nhkore/defn'
28
29
  require 'nhkore/dict'
29
30
  require 'nhkore/dict_scraper'
@@ -49,7 +49,10 @@ module NHKore
49
49
  end
50
50
 
51
51
  def add_article(article,key: nil,overwrite: false)
52
- key = article.url if key.nil?()
52
+ url = article.url
53
+ url = url.to_s() unless url.nil?()
54
+
55
+ key = key.nil?() ? url : key.to_s()
53
56
 
54
57
  if !overwrite
55
58
  raise ArgumentError,"duplicate article[#{key}] in articles" if @articles.key?(key)
@@ -57,7 +60,7 @@ module NHKore
57
60
  end
58
61
 
59
62
  @articles[key] = article
60
- @sha256s[article.sha256] = article.url
63
+ @sha256s[article.sha256] = url
61
64
 
62
65
  return self
63
66
  end
@@ -91,16 +94,20 @@ module NHKore
91
94
  end
92
95
 
93
96
  def update_article(article,url)
97
+ url = url.to_s() unless url.nil?()
98
+
94
99
  # Favor https.
95
- return if article.url =~ FAVORED_URL
100
+ return if article.url.to_s() =~ FAVORED_URL
96
101
  return if url !~ FAVORED_URL
97
102
 
98
- @articles.delete(article.url)
103
+ @articles.delete(article.url) # Probably no to_s() here
99
104
  @articles[url] = article
100
105
  article.url = url
101
106
  end
102
107
 
103
108
  def article(key)
109
+ key = key.to_s() unless key.nil?()
110
+
104
111
  return @articles[key]
105
112
  end
106
113
 
@@ -119,6 +126,8 @@ module NHKore
119
126
  end
120
127
 
121
128
  def article?(key)
129
+ key = key.to_s() unless key.nil?()
130
+
122
131
  return @articles.key?(key)
123
132
  end
124
133
 
@@ -21,6 +21,7 @@
21
21
  #++
22
22
 
23
23
 
24
+ require 'attr_bool'
24
25
  require 'nokogiri'
25
26
  require 'open-uri'
26
27
 
@@ -34,14 +35,16 @@ module NHKore
34
35
  # @since 0.2.0
35
36
  ###
36
37
  class Scraper
38
+ extend AttrBool::Ext
39
+
37
40
  DEFAULT_HEADER = {
38
41
  'user-agent' => UserAgents.sample(),
39
42
  'accept' => 'text/html,application/xhtml+xml,application/xml,application/rss+xml,text/xml;image/webp,image/apng,*/*;application/signed-exchange',
40
43
  'dnt' => '1',
41
44
  }
42
45
 
43
- attr_accessor :eat_cookie
44
- attr_accessor :is_file
46
+ attr_accessor? :eat_cookie
47
+ attr_accessor? :is_file
45
48
  attr_reader :kargs
46
49
  attr_accessor :max_redirects
47
50
  attr_accessor :max_retries
@@ -49,9 +52,6 @@ module NHKore
49
52
  attr_accessor :str_or_io
50
53
  attr_accessor :url
51
54
 
52
- alias_method :eat_cookie?,:eat_cookie
53
- alias_method :is_file?,:is_file
54
-
55
55
  # +max_redirects+ defaults to 3 for safety (infinite-loop attack).
56
56
  #
57
57
  # All URL options: https://ruby-doc.org/stdlib-2.7.0/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
@@ -21,6 +21,7 @@
21
21
  #++
22
22
 
23
23
 
24
+ require 'attr_bool'
24
25
  require 'time'
25
26
 
26
27
  require 'nhkore/fileable'
@@ -33,14 +34,14 @@ module NHKore
33
34
  # @since 0.2.0
34
35
  ###
35
36
  class SearchLink
36
- attr_accessor :datetime
37
- attr_accessor :futsuurl
38
- attr_accessor :scraped
37
+ extend AttrBool::Ext
38
+
39
+ attr_reader :datetime
40
+ attr_reader :futsuurl
41
+ attr_accessor? :scraped
39
42
  attr_accessor :sha256
40
43
  attr_accessor :title
41
- attr_accessor :url
42
-
43
- alias_method :scraped?,:scraped
44
+ attr_reader :url
44
45
 
45
46
  def initialize(url,scraped: false)
46
47
  super()
@@ -50,29 +51,27 @@ module NHKore
50
51
  @scraped = scraped
51
52
  @sha256 = sha256
52
53
  @title = nil
53
- @url = url
54
+ self.url = url
54
55
  end
55
56
 
56
57
  def encode_with(coder)
57
58
  # Order matters.
58
59
 
59
- coder[:url] = @url
60
+ coder[:url] = @url.nil?() ? nil : @url.to_s()
60
61
  coder[:scraped] = @scraped
61
- coder[:datetime] = @datetime.nil?() ? @datetime : @datetime.iso8601()
62
+ coder[:datetime] = @datetime.nil?() ? nil : @datetime.iso8601()
62
63
  coder[:title] = @title
63
- coder[:futsuurl] = @futsuurl
64
+ coder[:futsuurl] = @futsuurl.nil?() ? nil : @futsuurl.to_s()
64
65
  coder[:sha256] = @sha256
65
66
  end
66
67
 
67
68
  def self.load_data(key,hash)
68
- datetime = hash[:datetime]
69
-
70
69
  slink = SearchLink.new(
71
70
  hash[:url],
72
- scraped: hash[:scraped]
71
+ scraped: hash[:scraped],
73
72
  )
74
73
 
75
- slink.datetime = Util.empty_web_str?(datetime) ? nil : Time.iso8601(datetime)
74
+ slink.datetime = hash[:datetime]
76
75
  slink.futsuurl = hash[:futsuurl]
77
76
  slink.sha256 = hash[:sha256]
78
77
  slink.title = hash[:title]
@@ -83,13 +82,31 @@ module NHKore
83
82
  def update_from_article(article)
84
83
  # Don't update the url, as it may be different (e.g., http vs https).
85
84
 
86
- @datetime = article.datetime if @datetime.nil?()
87
- @futsuurl = article.futsuurl if Util.empty_web_str?(@futsuurl)
85
+ self.datetime = article.datetime if @datetime.nil?()
86
+ self.futsuurl = article.futsuurl if Util.empty_web_str?(@futsuurl)
88
87
  @scraped = true # If we have an article, it's been scraped
89
88
  @sha256 = article.sha256 if Util.empty_web_str?(@sha256)
90
89
  @title = article.title if Util.empty_web_str?(@title)
91
90
  end
92
91
 
92
+ def datetime=(value)
93
+ if value.is_a?(Time)
94
+ @datetime = value
95
+ else
96
+ @datetime = Util.empty_web_str?(value) ? nil : Time.iso8601(value)
97
+ end
98
+ end
99
+
100
+ def futsuurl=(value)
101
+ # Don't store URI, store String.
102
+ @futsuurl = value.nil?() ? nil : value.to_s()
103
+ end
104
+
105
+ def url=(value)
106
+ # Don't store URI, store String.
107
+ @url = value.nil?() ? nil : value.to_s()
108
+ end
109
+
93
110
  def to_s(mini: false)
94
111
  s = ''.dup()
95
112
 
@@ -137,9 +154,11 @@ module NHKore
137
154
  end
138
155
 
139
156
  def add_link(link)
140
- return self if @links.key?(link.url)
157
+ url = link.url.nil?() ? nil : link.url.to_s()
158
+
159
+ return self if @links.key?(url)
141
160
 
142
- @links[link.url] = link
161
+ @links[url] = link
143
162
 
144
163
  return self
145
164
  end
@@ -163,7 +182,7 @@ module NHKore
163
182
 
164
183
  if !links.nil?()
165
184
  links.each() do |key,hash|
166
- key = key.to_s() # Change from a symbol
185
+ key = key.to_s() unless key.nil?()
167
186
  slinks.links[key] = SearchLink.load_data(key,hash)
168
187
  end
169
188
  end
@@ -173,6 +192,7 @@ module NHKore
173
192
 
174
193
  def [](url)
175
194
  url = url.url if url.respond_to?(:url)
195
+ url = url.to_s() unless url.nil?()
176
196
 
177
197
  return @links[url]
178
198
  end
@@ -45,6 +45,16 @@ module NHKore
45
45
  # - https://www3.nhk.or.jp/news/easy/article/disaster_heat.html
46
46
  YASASHII_REGEX = /\A[^\.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i
47
47
 
48
+ IGNORE_LINK_REGEX = %r{
49
+ /about\.html? # https://www3.nhk.or.jp/news/easy/about.html
50
+ |/movieplayer\.html? # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
51
+ |/audio\.html? # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
52
+ |/news/easy/index\.html? # http://www3.nhk.or.jp/news/easy/index.html
53
+ # https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
54
+ # https://www3.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10012689671000&title=「鬼滅の刃」の映画が台湾でも始まって大勢の人が見に行く
55
+ |/enqform\.html?
56
+ }x
57
+
48
58
  # Search Engines are strict, so trigger using the default HTTP header fields
49
59
  # with +header: {}+ and fetch/set the cookie using +eat_cookie: true+.
50
60
  def initialize(url,eat_cookie: true,header: {},**kargs)
@@ -57,10 +67,8 @@ module NHKore
57
67
  link = Util.unspace_web_str(link).downcase() unless cleaned
58
68
 
59
69
  return true if link.empty?()
60
- return true if link =~ /\/about\.html?/ # https://www3.nhk.or.jp/news/easy/about.html
61
- return true if link =~ /\/movieplayer\.html?/ # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
62
- return true if link =~ /\/audio\.html?/ # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
63
- return true if link =~ /\/news\/easy\/index\.html?/ # http://www3.nhk.or.jp/news/easy/index.html
70
+
71
+ return true if IGNORE_LINK_REGEX.match?(link)
64
72
 
65
73
  return false
66
74
  end
@@ -60,6 +60,40 @@ module NHKore
60
60
  @output = nil
61
61
  end
62
62
 
63
+ def build_header()
64
+ header = []
65
+
66
+ header << 'Frequency' unless @ignores[:freq]
67
+ header << 'Word' unless @ignores[:word]
68
+ header << 'Kana' unless @ignores[:kana]
69
+ header << 'English' unless @ignores[:eng]
70
+ header << 'Definition' unless @ignores[:defn]
71
+
72
+ return header
73
+ end
74
+
75
+ def build_rows(words)
76
+ rows = []
77
+
78
+ words.each() do |word|
79
+ rows << build_word_row(word)
80
+ end
81
+
82
+ return rows
83
+ end
84
+
85
+ def build_word_row(word)
86
+ row = []
87
+
88
+ row << word.freq unless @ignores[:freq]
89
+ row << word.word unless @ignores[:word]
90
+ row << word.kana unless @ignores[:kana]
91
+ row << word.eng unless @ignores[:eng]
92
+ row << word.defn unless @ignores[:defn]
93
+
94
+ return row
95
+ end
96
+
63
97
  def filter?(article)
64
98
  return false if @filters.empty?()
65
99
 
@@ -151,26 +185,10 @@ module NHKore
151
185
  words = sift()
152
186
 
153
187
  @output = CSV.generate(headers: :first_row,write_headers: true) do |csv|
154
- row = []
155
-
156
- row << 'Frequency' unless @ignores[:freq]
157
- row << 'Word' unless @ignores[:word]
158
- row << 'Kana' unless @ignores[:kana]
159
- row << 'English' unless @ignores[:eng]
160
- row << 'Definition' unless @ignores[:defn]
161
-
162
- csv << row
188
+ csv << build_header()
163
189
 
164
190
  words.each() do |word|
165
- row = []
166
-
167
- row << word.freq unless @ignores[:freq]
168
- row << word.word unless @ignores[:word]
169
- row << word.kana unless @ignores[:kana]
170
- row << word.eng unless @ignores[:eng]
171
- row << word.defn unless @ignores[:defn]
172
-
173
- csv << row
191
+ csv << build_word_row(word)
174
192
  end
175
193
  end
176
194
 
@@ -232,7 +250,7 @@ module NHKore
232
250
  <h2>#{@caption}</h2>
233
251
  <table>
234
252
  EOH
235
- #" # Fix for editor
253
+ #"
236
254
 
237
255
  # If have too few or too many '<col>', invalid HTML.
238
256
  @output << %Q{<col style="width:6em;">\n} unless @ignores[:freq]
@@ -242,20 +260,20 @@ module NHKore
242
260
  @output << "<col>\n" unless @ignores[:defn] # No width for defn, fills rest of page
243
261
 
244
262
  @output << '<tr>'
245
- @output << '<th>Frequency</th>' unless @ignores[:freq]
246
- @output << '<th>Word</th>' unless @ignores[:word]
247
- @output << '<th>Kana</th>' unless @ignores[:kana]
248
- @output << '<th>English</th>' unless @ignores[:eng]
249
- @output << '<th>Definition</th>' unless @ignores[:defn]
263
+
264
+ build_header().each() do |h|
265
+ @output << "<th>#{h}</th>"
266
+ end
267
+
250
268
  @output << "</tr>\n"
251
269
 
252
270
  words.each() do |word|
253
271
  @output << '<tr>'
254
- @output << "<td>#{Util.escape_html(word.freq.to_s())}</td>" unless @ignores[:freq]
255
- @output << "<td>#{Util.escape_html(word.word.to_s())}</td>" unless @ignores[:word]
256
- @output << "<td>#{Util.escape_html(word.kana.to_s())}</td>" unless @ignores[:kana]
257
- @output << "<td>#{Util.escape_html(word.eng.to_s())}</td>" unless @ignores[:eng]
258
- @output << "<td>#{Util.escape_html(word.defn.to_s())}</td>" unless @ignores[:defn]
272
+
273
+ build_word_row(word).each() do |w|
274
+ @output << "<td>#{Util.escape_html(w.to_s())}</td>"
275
+ end
276
+
259
277
  @output << "</tr>\n"
260
278
  end
261
279
 
@@ -264,31 +282,63 @@ module NHKore
264
282
  </body>
265
283
  </html>
266
284
  EOH
267
- #/ # Fix for editor
285
+ #/
268
286
 
269
287
  return @output
270
288
  end
271
289
 
272
- def put_yaml!()
290
+ def put_json!()
291
+ require 'json'
292
+
273
293
  words = sift()
274
294
 
275
- # Just blank out ignores.
276
- if !@ignores.empty?()
277
- words.each() do |word|
278
- # word/kanji/kana do not have setters/mutators.
279
- word.defn = nil if @ignores[:defn]
280
- word.eng = nil if @ignores[:eng]
281
- word.freq = nil if @ignores[:freq]
295
+ @output = ''.dup()
296
+
297
+ @output << <<~EOJ
298
+ {
299
+ "caption": #{JSON.generate(@caption)},
300
+ "header": #{JSON.generate(build_header())},
301
+ "words": [
302
+ EOJ
303
+
304
+ if !words.empty?()
305
+ 0.upto(words.length - 2) do |i|
306
+ @output << " #{JSON.generate(build_word_row(words[i]))},\n"
282
307
  end
308
+
309
+ @output << " #{JSON.generate(build_word_row(words[-1]))}\n"
283
310
  end
284
311
 
312
+ @output << "]\n}\n"
313
+
314
+ return @output
315
+ end
316
+
317
+ def put_yaml!()
318
+ require 'psychgus'
319
+
320
+ words = sift()
321
+
285
322
  yaml = {
286
323
  caption: @caption,
287
- words: words
324
+ header: build_header(),
325
+ words: build_rows(words),
288
326
  }
289
327
 
328
+ header_styler = Class.new() do
329
+ include Psychgus::Styler
330
+
331
+ def style_sequence(sniffer,node)
332
+ parent = sniffer.parent
333
+
334
+ if !parent.nil?() && parent.node.respond_to?(:value) && parent.value == 'header'
335
+ node.style = Psychgus::SEQUENCE_FLOW
336
+ end
337
+ end
338
+ end
339
+
290
340
  # Put each Word on one line (flow/inline style).
291
- @output = Util.dump_yaml(yaml,flow_level: 4)
341
+ @output = Util.dump_yaml(yaml,flow_level: 4,stylers: header_styler.new())
292
342
 
293
343
  return @output
294
344
  end
@@ -306,7 +356,7 @@ module NHKore
306
356
 
307
357
  words = master_article.words.values()
308
358
 
309
- words = words.sort() do |word1,word2|
359
+ words.sort!() do |word1,word2|
310
360
  # Order by freq DESC (most frequent words to top).
311
361
  i = (word2.freq <=> word1.freq)
312
362