nhkore 0.3.7 → 0.3.11

--- a/lib/nhkore/search_link.rb
+++ b/lib/nhkore/search_link.rb
@@ -1,23 +1,11 @@
- #!/usr/bin/env ruby
  # encoding: UTF-8
  # frozen_string_literal: true

  #--
  # This file is part of NHKore.
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
- #
- # NHKore is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # NHKore is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public License
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
+ #
+ # SPDX-License-Identifier: LGPL-3.0-or-later
  #++


@@ -30,22 +18,22 @@ require 'nhkore/util'

  module NHKore
    ###
-   # @author Jonathan Bradley Whited (@esotericpig)
+   # @author Jonathan Bradley Whited
    # @since 0.2.0
    ###
    class SearchLink
      extend AttrBool::Ext
-
+
      attr_reader :datetime
      attr_reader :futsuurl
      attr_accessor? :scraped
      attr_accessor :sha256
      attr_accessor :title
      attr_reader :url
-
+
      def initialize(url,scraped: false)
        super()
-
+
        @datetime = nil
        @futsuurl = nil
        @scraped = scraped
@@ -53,42 +41,42 @@ module NHKore
        @title = nil
        self.url = url
      end
-
+
      def encode_with(coder)
        # Order matters.
-
-       coder[:url] = @url.nil?() ? nil : @url.to_s()
+
+       coder[:url] = @url.nil? ? nil : @url.to_s
        coder[:scraped] = @scraped
-       coder[:datetime] = @datetime.nil?() ? nil : @datetime.iso8601()
+       coder[:datetime] = @datetime.nil? ? nil : @datetime.iso8601
        coder[:title] = @title
-       coder[:futsuurl] = @futsuurl.nil?() ? nil : @futsuurl.to_s()
+       coder[:futsuurl] = @futsuurl.nil? ? nil : @futsuurl.to_s
        coder[:sha256] = @sha256
      end
-
+
      def self.load_data(key,hash)
        slink = SearchLink.new(
          hash[:url],
          scraped: hash[:scraped],
        )
-
+
        slink.datetime = hash[:datetime]
        slink.futsuurl = hash[:futsuurl]
        slink.sha256 = hash[:sha256]
        slink.title = hash[:title]
-
+
        return slink
      end
-
+
      def update_from_article(article)
        # Don't update the url, as it may be different (e.g., http vs https).
-
-       self.datetime = article.datetime if @datetime.nil?()
+
+       self.datetime = article.datetime if @datetime.nil?
        self.futsuurl = article.futsuurl if Util.empty_web_str?(@futsuurl)
        @scraped = true # If we have an article, it's been scraped
        @sha256 = article.sha256 if Util.empty_web_str?(@sha256)
        @title = article.title if Util.empty_web_str?(@title)
      end
-
+
      def datetime=(value)
        if value.is_a?(Time)
          @datetime = value
@@ -96,22 +84,22 @@ module NHKore
          @datetime = Util.empty_web_str?(value) ? nil : Time.iso8601(value)
        end
      end
-
+
      def futsuurl=(value)
        # Don't store URI, store String.
-       @futsuurl = value.nil?() ? nil : value.to_s()
+       @futsuurl = value.nil? ? nil : value.to_s
      end
-
+
      def url=(value)
        # Don't store URI, store String.
-       @url = value.nil?() ? nil : value.to_s()
+       @url = value.nil? ? nil : value.to_s
      end
-
+
      def to_s(mini: false)
-       s = ''.dup()
-
+       s = ''.dup
+
        s << "'#{@url}': "
-
+
        if mini
          s << "{ scraped? #{@scraped ? 'yes' : 'NO'} }"
        else
@@ -121,87 +109,85 @@ module NHKore
          s << "\n futsuurl: '#{@futsuurl}'"
          s << "\n sha256: '#{@sha256}'"
        end
-
+
        return s
      end
    end
-
+
    ###
-   # @author Jonathan Bradley Whited (@esotericpig)
+   # @author Jonathan Bradley Whited
    # @since 0.2.0
    ###
    class SearchLinks
      include Fileable
-
+
      DEFAULT_DIR = Util::CORE_DIR
-
+
      DEFAULT_FUTSUU_FILENAME = 'links_nhk_news_web_regular.yml'
      DEFAULT_YASASHII_FILENAME = 'links_nhk_news_web_easy.yml'
-
+
      def self.build_file(filename)
        return File.join(DEFAULT_DIR,filename)
      end
-
+
      DEFAULT_FUTSUU_FILE = build_file(DEFAULT_FUTSUU_FILENAME)
      DEFAULT_YASASHII_FILE = build_file(DEFAULT_YASASHII_FILENAME)
-
+
      attr_reader :links
-
-     def initialize()
+
+     def initialize
        super()
-
+
        @links = {}
      end
-
+
      def add_link(link)
-       url = link.url.nil?() ? nil : link.url.to_s()
-
+       url = link.url.nil? ? nil : link.url.to_s
+
        return self if @links.key?(url)
-
+
        @links[url] = link
-
+
        return self
      end
-
+
      def each(&block)
        return @links.each(&block)
      end
-
+
      def encode_with(coder)
        # Order matters.
-
+
        coder[:links] = @links
      end
-
+
      def self.load_data(data,file: nil,**kargs)
        data = Util.load_yaml(data,file: file)
-
+
        links = data[:links]
-
-       slinks = SearchLinks.new()
-
-       if !links.nil?()
-         links.each() do |key,hash|
-           key = key.to_s() unless key.nil?()
-           slinks.links[key] = SearchLink.load_data(key,hash)
-         end
+
+       slinks = SearchLinks.new
+
+       links&.each() do |key,hash|
+         key = key.to_s unless key.nil?
+         slinks.links[key] = SearchLink.load_data(key,hash)
        end
-
+
        return slinks
      end
-
+
      def [](url)
        url = url.url if url.respond_to?(:url)
-       url = url.to_s() unless url.nil?()
-
+       url = url.to_s unless url.nil?
+
        return @links[url]
      end
-
-     def length()
+
+     def length
        return @links.length
      end
-
-     def to_s()
+
+     def to_s
        return Util.dump_yaml(self)
      end
    end
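
The encode_with / load_data pair above is what round-trips a SearchLinks YAML file (links_nhk_news_web_easy.yml and friends). A minimal sketch of that round trip, assuming the nhkore gem is installed and that Util.load_yaml accepts the dumped string directly (its file: argument defaults to nil):

require 'nhkore/search_link'

slinks = NHKore::SearchLinks.new
slinks.add_link(NHKore::SearchLink.new(
  'https://www3.nhk.or.jp/news/easy/k10012294001000/k10012294001000.html'
))

yaml = slinks.to_s # serializes via encode_with (Util.dump_yaml)

# Sketch assumption: load_data parses the raw YAML string.
copy = NHKore::SearchLinks.load_data(yaml)
puts copy.length # => 1
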
--- a/lib/nhkore/search_scraper.rb
+++ b/lib/nhkore/search_scraper.rb
@@ -1,23 +1,11 @@
- #!/usr/bin/env ruby
  # encoding: UTF-8
  # frozen_string_literal: true

  #--
  # This file is part of NHKore.
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
- #
- # NHKore is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # NHKore is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public License
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
+ #
+ # SPDX-License-Identifier: LGPL-3.0-or-later
  #++


@@ -31,197 +19,198 @@ require 'nhkore/util'

  module NHKore
    ###
-   # @author Jonathan Bradley Whited (@esotericpig)
+   # @author Jonathan Bradley Whited
    # @since 0.2.0
    ###
    class SearchScraper < Scraper
      DEFAULT_RESULT_COUNT = 100
      FUTSUU_SITE = 'nhk.or.jp/news/html/'
      YASASHII_SITE = 'nhk.or.jp/news/easy/'
-
+
      # https://www3.nhk.or.jp/news/html/20200220/k10012294001000.html
-     FUTSUU_REGEX = /\A[^\.]+\.#{Regexp.quote(FUTSUU_SITE)}.+\.html?/i
+     FUTSUU_REGEX = /\A[^.]+\.#{Regexp.quote(FUTSUU_SITE)}.+\.html?/i.freeze
      # https://www3.nhk.or.jp/news/easy/k10012294001000/k10012294001000.html
      # - https://www3.nhk.or.jp/news/easy/article/disaster_heat.html
-     YASASHII_REGEX = /\A[^\.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i
-
+     YASASHII_REGEX = /\A[^.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i.freeze
+
      IGNORE_LINK_REGEX = %r{
-       /about\.html? # https://www3.nhk.or.jp/news/easy/about.html
+       /about\.html?        # https://www3.nhk.or.jp/news/easy/about.html
        |/movieplayer\.html? # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
        |/audio\.html? # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
        |/news/easy/index\.html? # http://www3.nhk.or.jp/news/easy/index.html
+
        # https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
        # https://www3.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10012689671000&title=「鬼滅の刃」の映画が台湾でも始まって大勢の人が見に行く
        |/enqform\.html?
-     }x
-
+     }x.freeze
+
      # Search Engines are strict, so trigger using the default HTTP header fields
      # with +header: {}+ and fetch/set the cookie using +eat_cookie: true+.
      def initialize(url,eat_cookie: true,header: {},**kargs)
        super(url,eat_cookie: eat_cookie,header: header,**kargs)
      end
-
+
      def ignore_link?(link,cleaned: true)
-       return true if link.nil?()
-
-       link = Util.unspace_web_str(link).downcase() unless cleaned
-
-       return true if link.empty?()
-
+       return true if link.nil?
+
+       link = Util.unspace_web_str(link).downcase unless cleaned
+
+       return true if link.empty?
+
        return true if IGNORE_LINK_REGEX.match?(link)
-
+
        return false
      end
    end
-
+
    ###
-   # @author Jonathan Bradley Whited (@esotericpig)
+   # @author Jonathan Bradley Whited
    # @since 0.2.0
    ###
    class BingScraper < SearchScraper
      attr_reader :regex
      attr_reader :site
-
+
      def initialize(site,regex: nil,url: nil,**kargs)
        case site
        when :futsuu
-         regex = FUTSUU_REGEX if regex.nil?()
+         regex = FUTSUU_REGEX if regex.nil?
          site = FUTSUU_SITE
        when :yasashii
-         regex = YASASHII_REGEX if regex.nil?()
+         regex = YASASHII_REGEX if regex.nil?
          site = YASASHII_SITE
        else
          raise ArgumentError,"invalid site[#{site}]"
        end
-
-       raise ArgumentError,"empty regex[#{regex}]" if regex.nil?()
-
+
+       raise ArgumentError,"empty regex[#{regex}]" if regex.nil?
+
        @regex = regex
        @site = site
-       url = self.class.build_url(site,**kargs) if url.nil?()
-
+       url = self.class.build_url(site,**kargs) if url.nil?
+
        # Delete class-specific args (don't pass to Open-URI).
        kargs.delete(:count)
-
+
        super(url,**kargs)
      end
-
+
      def self.build_url(site,count: DEFAULT_RESULT_COUNT,**kargs)
-       url = ''.dup()
-
+       url = ''.dup
+
        url << 'https://www.bing.com/search?'
        url << URI.encode_www_form(
          q: "site:#{site}",
          count: count
        )
-
+
        return url
      end
-
+
      def scrape(slinks,page=NextPage.new())
        next_page,link_count = scrape_html(slinks,page)
-
+
        if link_count <= 0
          scrape_rss(slinks,page,next_page)
        end
-
+
        return next_page
      end
-
+
      def scrape_html(slinks,page,next_page=NextPage.new())
-       doc = html_doc()
+       doc = html_doc
        link_count = 0
-
+
        anchors = doc.css('a')
-
-       anchors.each() do |anchor|
-         href = anchor['href'].to_s()
-         href = Util.unspace_web_str(href).downcase()
-
+
+       anchors.each do |anchor|
+         href = anchor['href'].to_s
+         href = Util.unspace_web_str(href).downcase
+
          next if ignore_link?(href)
-
-         if (md = href.match(/first\=(\d+)/))
-           count = md[1].to_i()
-
+
+         if (md = href.match(/first=(\d+)/))
+           count = md[1].to_i
+
            if count > page.count && (next_page.count < 0 || count < next_page.count)
              next_page.count = count
              next_page.url = join_url(href)
            end
          elsif href =~ regex
            slinks.add_link(SearchLink.new(href))
-
            link_count += 1
          end
        end
-
+
        return [next_page,link_count]
      end
-
+
      def scrape_rss(slinks,page,next_page=NextPage.new())
        link_count = 0
-
+
        if !@is_file
          uri = URI(@url)
-
+
          Util.replace_uri_query!(uri,format: 'rss')
-         open(uri)
-
-         doc = rss_doc()
+         self.open(uri)
+
+         doc = rss_doc
          rss_links = []
-
-         doc.items.each() do |item|
-           link = item.link.to_s()
-           link = Util.unspace_web_str(link).downcase()
-
+
+         doc.items.each do |item|
+           link = item.link.to_s
+           link = Util.unspace_web_str(link).downcase
+
            rss_links << link
-
+
            next if ignore_link?(link)
            next if link !~ regex
-
+
            slinks.add_link(SearchLink.new(link))
-
+
            link_count += 1
          end
-
+
          # For RSS, Bing will keep returning the same links over and over
          # if it's the last page or the "first=" query is the wrong count.
          # Therefore, we have to test the previous RSS links (+page.rss_links+).
-         if next_page.empty?() && doc.items.length >= 1 && page.rss_links != rss_links
+         if next_page.empty? && doc.items.length >= 1 && page.rss_links != rss_links
            next_page.count = (page.count < 0) ? 0 : page.count
            next_page.count += doc.items.length
            next_page.rss_links = rss_links
-
-           uri = URI(page.url.nil?() ? @url : page.url)
-
+
+           uri = URI(page.url.nil? ? @url : page.url)
+
            Util.replace_uri_query!(uri,first: next_page.count)
-
+
            next_page.url = uri
          end
        end
-
+
        return [next_page,link_count]
      end
    end
-
+
    ###
-   # @author Jonathan Bradley Whited (@esotericpig)
+   # @author Jonathan Bradley Whited
    # @since 0.2.0
    ###
    class NextPage
      attr_accessor :count
      attr_accessor :rss_links
      attr_accessor :url
-
-     def initialize()
+
+     def initialize
        super()
-
+
        @count = -1
        @rss_links = nil
        @url = nil
      end
-
-     def empty?()
-       return @url.nil?() || @count < 0
+
+     def empty?
+       return @url.nil? || @count < 0
      end
    end
  end
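
For reference, the scrape flow above (HTML first, then the RSS fallback) can be driven like this. A hedged sketch, not part of the diff: it assumes the nhkore gem is installed, network access to Bing, and that Bing's result markup still matches what scrape_html expects.

require 'nhkore/search_link'
require 'nhkore/search_scraper'

slinks = NHKore::SearchLinks.new

# Builds the Bing query URL for site:nhk.or.jp/news/easy/ with count=100.
scraper = NHKore::BingScraper.new(:yasashii)

next_page = scraper.scrape(slinks) # falls back to scrape_rss if 0 HTML links
puts slinks.length
puts next_page.empty? ? '(no next page)' : next_page.url

Paging would repeat with a new BingScraper built from url: next_page.url until next_page.empty? returns true.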