nhkore 0.3.7 → 0.3.11

Sign up to get free protection for your applications and to get access to all the features.
data/lib/nhkore/news.rb CHANGED
@@ -1,23 +1,11 @@
1
- #!/usr/bin/env ruby
2
1
  # encoding: UTF-8
3
2
  # frozen_string_literal: true
4
3
 
5
4
  #--
6
5
  # This file is part of NHKore.
7
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
- #
9
- # NHKore is free software: you can redistribute it and/or modify
10
- # it under the terms of the GNU Lesser General Public License as published by
11
- # the Free Software Foundation, either version 3 of the License, or
12
- # (at your option) any later version.
13
- #
14
- # NHKore is distributed in the hope that it will be useful,
15
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
- # GNU Lesser General Public License for more details.
18
- #
19
- # You should have received a copy of the GNU Lesser General Public License
20
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
6
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
7
+ #
8
+ # SPDX-License-Identifier: LGPL-3.0-or-later
21
9
  #++
22
10
 
23
11
 
@@ -29,155 +17,153 @@ require 'nhkore/util'
29
17
 
30
18
  module NHKore
31
19
  ###
32
- # @author Jonathan Bradley Whited (@esotericpig)
20
+ # @author Jonathan Bradley Whited
33
21
  # @since 0.2.0
34
22
  ###
35
23
  class News
36
24
  include Fileable
37
-
25
+
38
26
  DEFAULT_DIR = Util::CORE_DIR
39
- FAVORED_URL = /https\:/i
40
-
27
+ FAVORED_URL = /https:/i.freeze
28
+
41
29
  attr_reader :articles
42
30
  attr_reader :sha256s
43
-
44
- def initialize()
31
+
32
+ def initialize
45
33
  super()
46
-
34
+
47
35
  @articles = {}
48
36
  @sha256s = {}
49
37
  end
50
-
38
+
51
39
  def add_article(article,key: nil,overwrite: false)
52
40
  url = article.url
53
- url = url.to_s() unless url.nil?()
54
-
55
- key = key.nil?() ? url : key.to_s()
56
-
41
+ url = url.to_s unless url.nil?
42
+
43
+ key = key.nil? ? url : key.to_s
44
+
57
45
  if !overwrite
58
46
  raise ArgumentError,"duplicate article[#{key}] in articles" if @articles.key?(key)
59
47
  raise ArgumentError,"duplicate sha256[#{article.sha256}] in articles" if @sha256s.key?(article.sha256)
60
48
  end
61
-
49
+
62
50
  @articles[key] = article
63
51
  @sha256s[article.sha256] = url
64
-
52
+
65
53
  return self
66
54
  end
67
-
55
+
68
56
  def self.build_file(filename)
69
57
  return File.join(DEFAULT_DIR,filename)
70
58
  end
71
-
59
+
72
60
  def encode_with(coder)
73
61
  # Order matters.
74
62
  # Don't output @sha256s.
75
-
63
+
76
64
  coder[:articles] = @articles
77
65
  end
78
-
66
+
79
67
  def self.load_data(data,article_class: Article,file: nil,news_class: News,overwrite: false,**kargs)
80
68
  data = Util.load_yaml(data,file: file)
81
-
69
+
82
70
  articles = data[:articles]
83
-
84
- news = news_class.new()
85
-
86
- if !articles.nil?()
87
- articles.each() do |key,hash|
88
- key = key.to_s() # Change from a symbol
89
- news.add_article(article_class.load_data(key,hash),key: key,overwrite: overwrite)
90
- end
71
+
72
+ news = news_class.new
73
+
74
+ articles&.each() do |key,hash|
75
+ key = key.to_s # Change from a symbol
76
+ news.add_article(article_class.load_data(key,hash),key: key,overwrite: overwrite)
91
77
  end
92
-
78
+
93
79
  return news
94
80
  end
95
-
81
+
96
82
  def update_article(article,url)
97
- url = url.to_s() unless url.nil?()
98
-
83
+ url = url.to_s unless url.nil?
84
+
99
85
  # Favor https.
100
- return if article.url.to_s() =~ FAVORED_URL
86
+ return if article.url.to_s =~ FAVORED_URL
101
87
  return if url !~ FAVORED_URL
102
-
88
+
103
89
  @articles.delete(article.url) # Probably no to_s() here
104
90
  @articles[url] = article
105
91
  article.url = url
106
92
  end
107
-
93
+
108
94
  def article(key)
109
- key = key.to_s() unless key.nil?()
110
-
95
+ key = key.to_s unless key.nil?
96
+
111
97
  return @articles[key]
112
98
  end
113
-
99
+
114
100
  def article_with_sha256(sha256)
115
101
  article = nil
116
-
117
- @articles.values().each() do |a|
102
+
103
+ @articles.each_value do |a|
118
104
  if a.sha256 == sha256
119
105
  article = a
120
-
106
+
121
107
  break
122
108
  end
123
109
  end
124
-
110
+
125
111
  return article
126
112
  end
127
-
113
+
128
114
  def article?(key)
129
- key = key.to_s() unless key.nil?()
130
-
115
+ key = key.to_s unless key.nil?
116
+
131
117
  return @articles.key?(key)
132
118
  end
133
-
119
+
134
120
  def sha256?(sha256)
135
121
  return @sha256s.key?(sha256)
136
122
  end
137
-
138
- def to_s()
123
+
124
+ def to_s
139
125
  # Put each Word on one line (flow/inline style).
140
126
  return Util.dump_yaml(self,flow_level: 8)
141
127
  end
142
128
  end
143
-
129
+
144
130
  ###
145
- # @author Jonathan Bradley Whited (@esotericpig)
131
+ # @author Jonathan Bradley Whited
146
132
  # @since 0.2.0
147
133
  ###
148
134
  class FutsuuNews < News
149
135
  DEFAULT_FILENAME = 'nhk_news_web_regular.yml'
150
136
  DEFAULT_FILE = build_file(DEFAULT_FILENAME)
151
-
137
+
152
138
  def self.load_data(data,**kargs)
153
139
  return News.load_data(data,article_class: Article,news_class: FutsuuNews,**kargs)
154
140
  end
155
-
141
+
156
142
  def self.load_file(file=DEFAULT_FILE,**kargs)
157
143
  return News.load_file(file,article_class: Article,news_class: FutsuuNews,**kargs)
158
144
  end
159
-
145
+
160
146
  def save_file(file=DEFAULT_FILE,**kargs)
161
147
  super(file,**kargs)
162
148
  end
163
149
  end
164
-
150
+
165
151
  ###
166
- # @author Jonathan Bradley Whited (@esotericpig)
152
+ # @author Jonathan Bradley Whited
167
153
  # @since 0.2.0
168
154
  ###
169
155
  class YasashiiNews < News
170
156
  DEFAULT_FILENAME = 'nhk_news_web_easy.yml'
171
157
  DEFAULT_FILE = build_file(DEFAULT_FILENAME)
172
-
158
+
173
159
  def self.load_data(data,**kargs)
174
160
  return News.load_data(data,article_class: Article,news_class: YasashiiNews,**kargs)
175
161
  end
176
-
162
+
177
163
  def self.load_file(file=DEFAULT_FILE,**kargs)
178
164
  return News.load_file(file,article_class: Article,news_class: YasashiiNews,**kargs)
179
165
  end
180
-
166
+
181
167
  def save_file(file=DEFAULT_FILE,**kargs)
182
168
  super(file,**kargs)
183
169
  end
data/lib/nhkore/polisher.rb CHANGED
@@ -1,23 +1,11 @@
1
- #!/usr/bin/env ruby
2
1
  # encoding: UTF-8
3
2
  # frozen_string_literal: true
4
3
 
5
4
  #--
6
5
  # This file is part of NHKore.
7
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
- #
9
- # NHKore is free software: you can redistribute it and/or modify
10
- # it under the terms of the GNU Lesser General Public License as published by
11
- # the Free Software Foundation, either version 3 of the License, or
12
- # (at your option) any later version.
13
- #
14
- # NHKore is distributed in the hope that it will be useful,
15
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
- # GNU Lesser General Public License for more details.
18
- #
19
- # You should have received a copy of the GNU Lesser General Public License
20
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
6
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
7
+ #
8
+ # SPDX-License-Identifier: LGPL-3.0-or-later
21
9
  #++
22
10
 
23
11
 
@@ -26,28 +14,28 @@ require 'nhkore/word'
26
14
 
27
15
  module NHKore
28
16
  ###
29
- # @author Jonathan Bradley Whited (@esotericpig)
17
+ # @author Jonathan Bradley Whited
30
18
  # @since 0.2.0
31
19
  ###
32
20
  class Polisher
33
21
  def begin_polish(str)
34
22
  return str
35
23
  end
36
-
24
+
37
25
  def polish(str)
38
26
  str = begin_polish(str)
39
27
  str = end_polish(str)
40
-
28
+
41
29
  return str
42
30
  end
43
-
31
+
44
32
  def self.polish_any(obj,polishers)
45
- return nil if obj.nil?()
46
-
33
+ return nil if obj.nil?
34
+
47
35
  polishers = Array(polishers)
48
-
49
- return obj if polishers.empty?()
50
-
36
+
37
+ return obj if polishers.empty?
38
+
51
39
  if obj.is_a?(Word)
52
40
  obj = Word.new(
53
41
  kana: polish_any(obj.kana,polishers),
@@ -55,17 +43,17 @@ module NHKore
55
43
  word: obj
56
44
  )
57
45
  else # String
58
- polishers.each() do |polisher|
46
+ polishers.each do |polisher|
59
47
  obj = polisher.polish(obj)
60
48
  end
61
49
  end
62
-
50
+
63
51
  return obj
64
52
  end
65
53
  end
66
-
54
+
67
55
  ###
68
- # @author Jonathan Bradley Whited (@esotericpig)
56
+ # @author Jonathan Bradley Whited
69
57
  # @since 0.2.0
70
58
  ###
71
59
  class BasicPolisher < Polisher
@@ -74,18 +62,18 @@ module NHKore
74
62
  # - Yunibaasaru・Sutajio・Japan
75
63
  # Keep numbers next to kanji/kana, else the below kana won't make sense:
76
64
  # - Word { kanji: 20日, kana: はつか }
77
-
65
+
78
66
  str = str.gsub(/[^[[:alnum:]]・]/,'')
79
-
67
+
80
68
  # Numbers/dots by themselves (without kanji/kana) should be ignored (empty).
81
- str = '' if str.gsub(/[[[:digit:]]・]+/,'').empty?()
82
-
69
+ str = '' if str.gsub(/[[[:digit:]]・]+/,'').empty?
70
+
83
71
  return str
84
72
  end
85
73
  end
86
-
74
+
87
75
  ###
88
- # @author Jonathan Bradley Whited (@esotericpig)
76
+ # @author Jonathan Bradley Whited
89
77
  # @since 0.2.0
90
78
  ###
91
79
  class BestPolisher < BasicPolisher
data/lib/nhkore/scraper.rb CHANGED
@@ -1,23 +1,11 @@
1
- #!/usr/bin/env ruby
2
1
  # encoding: UTF-8
3
2
  # frozen_string_literal: true
4
3
 
5
4
  #--
6
5
  # This file is part of NHKore.
7
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
- #
9
- # NHKore is free software: you can redistribute it and/or modify
10
- # it under the terms of the GNU Lesser General Public License as published by
11
- # the Free Software Foundation, either version 3 of the License, or
12
- # (at your option) any later version.
13
- #
14
- # NHKore is distributed in the hope that it will be useful,
15
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
- # GNU Lesser General Public License for more details.
18
- #
19
- # You should have received a copy of the GNU Lesser General Public License
20
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
6
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
7
+ #
8
+ # SPDX-License-Identifier: LGPL-3.0-or-later
21
9
  #++
22
10
 
23
11
 
@@ -31,18 +19,19 @@ require 'nhkore/util'
31
19
 
32
20
  module NHKore
33
21
  ###
34
- # @author Jonathan Bradley Whited (@esotericpig)
22
+ # @author Jonathan Bradley Whited
35
23
  # @since 0.2.0
36
24
  ###
37
25
  class Scraper
38
26
  extend AttrBool::Ext
39
-
27
+
40
28
  DEFAULT_HEADER = {
41
- 'user-agent' => UserAgents.sample(),
42
- 'accept' => 'text/html,application/xhtml+xml,application/xml,application/rss+xml,text/xml;image/webp,image/apng,*/*;application/signed-exchange',
29
+ 'user-agent' => UserAgents.sample,
30
+ 'accept' => 'text/html,application/xhtml+xml,application/xml,application/rss+xml,text/xml;image/webp' \
31
+ ',image/apng,*/*;application/signed-exchange',
43
32
  'dnt' => '1',
44
- }
45
-
33
+ }.freeze
34
+
46
35
  attr_accessor? :eat_cookie
47
36
  attr_accessor? :is_file
48
37
  attr_reader :kargs
@@ -51,80 +40,81 @@ module NHKore
51
40
  attr_accessor :redirect_rule
52
41
  attr_accessor :str_or_io
53
42
  attr_accessor :url
54
-
43
+
55
44
  # +max_redirects+ defaults to 3 for safety (infinite-loop attack).
56
- #
45
+ #
57
46
  # All URL options: https://ruby-doc.org/stdlib-2.7.0/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
58
- #
47
+ #
59
48
  # Pass in +header: {}+ for the default HTTP header fields to be set.
60
- #
49
+ #
61
50
  # @param eat_cookie [true,false] true to set the HTTP header field 'cookie', which can be an expensive
62
51
  # (time-consuming) operation since it opens the URL again, but necessary for some URLs.
63
52
  # @param redirect_rule [nil,:lenient,:strict]
64
- def initialize(url,eat_cookie: false,header: nil,is_file: false,max_redirects: 3,max_retries: 3,redirect_rule: :strict,str_or_io: nil,**kargs)
53
+ def initialize(url,eat_cookie: false,header: nil,is_file: false,max_redirects: 3,max_retries: 3,
54
+ redirect_rule: :strict,str_or_io: nil,**kargs)
65
55
  super()
66
-
67
- if !header.nil?() && !is_file
56
+
57
+ if !header.nil? && !is_file
68
58
  # Some sites (Search Engines) hate scrapers, so need HTTP header fields.
69
59
  # If this isn't enough, look at googler for more header fields to set:
70
60
  # - https://github.com/jarun/googler
71
61
  # If necessary, can use Faraday, HTTParty, or RestClient gem and
72
62
  # pass in to str_or_io.
73
-
63
+
74
64
  header = DEFAULT_HEADER.merge(header)
75
65
  kargs.merge!(header)
76
66
  end
77
-
67
+
78
68
  @eat_cookie = eat_cookie
79
69
  @is_file = is_file
80
70
  @kargs = kargs
81
71
  @max_redirects = max_redirects
82
72
  @max_retries = max_retries
83
73
  @redirect_rule = redirect_rule
84
-
85
- open(url,str_or_io,is_file: is_file)
74
+
75
+ self.open(url,str_or_io,is_file: is_file)
86
76
  end
87
-
77
+
88
78
  def fetch_cookie(url)
89
79
  require 'http-cookie'
90
-
80
+
91
81
  open_url(url)
92
-
82
+
93
83
  cookies = Array(@str_or_io.meta['set-cookie']) # nil will be []
94
-
95
- if !cookies.empty?()
96
- jar = HTTP::CookieJar.new()
84
+
85
+ if !cookies.empty?
86
+ jar = HTTP::CookieJar.new
97
87
  uri = URI(url)
98
-
99
- cookies.each() do |cookie|
88
+
89
+ cookies.each do |cookie|
100
90
  jar.parse(cookie,uri)
101
91
  end
102
-
92
+
103
93
  @kargs['cookie'] = HTTP::Cookie.cookie_value(jar.cookies(uri))
104
94
  end
105
-
95
+
106
96
  return self
107
97
  end
108
-
109
- def html_doc()
98
+
99
+ def html_doc
110
100
  return Nokogiri::HTML(@str_or_io)
111
101
  end
112
-
102
+
113
103
  def join_url(relative_url)
114
104
  # For a file, don't know what to do.
115
105
  # It would be unsafe to return something else;
116
106
  # for example, it could return a lot of "../../../" to your root dir.
117
107
  return nil if @is_file
118
-
119
- return URI::join(@url,relative_url)
108
+
109
+ return URI.join(@url,relative_url)
120
110
  end
121
-
111
+
122
112
  def open(url,str_or_io=nil,is_file: @is_file)
123
113
  @is_file = is_file
124
114
  @str_or_io = str_or_io
125
115
  @url = url
126
-
127
- if str_or_io.nil?()
116
+
117
+ if str_or_io.nil?
128
118
  if @is_file
129
119
  open_file(url)
130
120
  else
@@ -132,85 +122,86 @@ module NHKore
132
122
  open_url(url)
133
123
  end
134
124
  end
135
-
125
+
136
126
  return self
137
127
  end
138
-
128
+
139
129
  def open_file(file)
140
130
  @is_file = true
141
131
  @url = file
142
-
132
+
143
133
  # NHK's website tends to always use UTF-8.
144
134
  @str_or_io = File.open(file,'rt:UTF-8',**@kargs)
145
-
135
+
146
136
  return self
147
137
  end
148
-
138
+
149
139
  def open_url(url)
150
- max_redirects = (@max_redirects.nil?() || @max_redirects < 0) ? 10_000 : @max_redirects
151
- max_retries = (@max_retries.nil?() || @max_retries < 0) ? 10_000 : @max_retries
152
-
140
+ max_redirects = (@max_redirects.nil? || @max_redirects < 0) ? 10_000 : @max_redirects
141
+ max_retries = (@max_retries.nil? || @max_retries < 0) ? 10_000 : @max_retries
142
+
153
143
  top_uri = URI(url)
154
144
  top_domain = Util.domain(top_uri.host)
155
-
145
+
156
146
  begin
157
- # Use URI.open() instead of (Kernel.)open() for safety (code-injection attack).
158
- @str_or_io = URI.open(url,redirect: false,**@kargs)
147
+ # Use URI().open() instead of URI.open()/(Kernel.)open() for safety (code-injection attack).
148
+ # Use URI() instead of URI.parse() because url can be a URI (not just a string).
149
+ @str_or_io = URI(url).open(redirect: false,**@kargs)
159
150
  @url = url
160
151
  rescue OpenURI::HTTPRedirect => redirect
161
152
  redirect_uri = redirect.uri
162
-
153
+
163
154
  if (max_redirects -= 1) < 0
164
155
  raise redirect.exception("redirected to URL[#{redirect_uri}]: #{redirect}")
165
156
  end
166
-
157
+
167
158
  case @redirect_rule
168
159
  when :lenient,:strict
169
160
  if redirect_uri.scheme != top_uri.scheme
170
- raise redirect.exception("redirect scheme[#{redirect_uri.scheme}] does not match original " +
161
+ raise redirect.exception("redirect scheme[#{redirect_uri.scheme}] does not match original " \
171
162
  "scheme[#{top_uri.scheme}] at redirect URL[#{redirect_uri}]: #{redirect}")
172
163
  end
173
-
164
+
174
165
  if @redirect_rule == :strict
175
166
  redirect_domain = Util.domain(redirect_uri.host)
176
-
167
+
177
168
  if redirect_domain != top_domain
178
- raise redirect.exception("redirect domain[#{redirect_domain}] does not match original " +
169
+ raise redirect.exception("redirect domain[#{redirect_domain}] does not match original " \
179
170
  "domain[#{top_domain}] at redirect URL[#{redirect_uri}]: #{redirect}")
180
171
  end
181
172
  end
182
173
  end
183
-
174
+
184
175
  url = redirect_uri
185
-
176
+
186
177
  retry
187
178
  # Must come after HTTPRedirect since a subclass of HTTPError.
188
179
  rescue OpenURI::HTTPError => e
189
- raise e.exception("HTTP error[#{e.to_s()}] at URL[#{url}]")
180
+ raise e.exception("HTTP error[#{e}] at URL[#{url}]")
190
181
  rescue SocketError => e
191
182
  if (max_retries -= 1) < 0
192
- raise e.exception("Socket error[#{e.to_s()}] at URL[#{url}]")
183
+ raise e.exception("Socket error[#{e}] at URL[#{url}]")
193
184
  end
194
-
185
+
195
186
  retry
196
187
  end
197
-
188
+
198
189
  return self
199
190
  end
200
-
201
- def read()
202
- @str_or_io = @str_or_io.read() if @str_or_io.respond_to?(:read)
203
-
191
+
192
+ def read
193
+ @str_or_io = @str_or_io.read if @str_or_io.respond_to?(:read)
194
+
204
195
  return @str_or_io
205
196
  end
206
-
207
- def reopen()
208
- return open(@url)
197
+
198
+ def reopen
199
+ return self.open(@url)
209
200
  end
210
-
211
- def rss_doc()
201
+
202
+ def rss_doc
212
203
  require 'rss'
213
-
204
+
214
205
  return RSS::Parser.parse(@str_or_io,validate: false)
215
206
  end
216
207
  end