nhkore 0.3.4 → 0.3.9

@@ -1,23 +1,11 @@
- #!/usr/bin/env ruby
  # encoding: UTF-8
  # frozen_string_literal: true

  #--
  # This file is part of NHKore.
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
- #
- # NHKore is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # NHKore is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public License
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
+ #
+ # SPDX-License-Identifier: LGPL-3.0-or-later
  #++


@@ -31,16 +19,19 @@ require 'nhkore/util'

  module NHKore
  ###
- # @author Jonathan Bradley Whited (@esotericpig)
+ # @author Jonathan Bradley Whited
  # @since 0.2.0
  ###
  class Scraper
+ extend AttrBool::Ext
+
  DEFAULT_HEADER = {
- 'user-agent' => UserAgents.sample(),
- 'accept' => 'text/html,application/xhtml+xml,application/xml,application/rss+xml,text/xml;image/webp,image/apng,*/*;application/signed-exchange',
+ 'user-agent' => UserAgents.sample,
+ 'accept' => 'text/html,application/xhtml+xml,application/xml,application/rss+xml,text/xml;image/webp' \
+ ',image/apng,*/*;application/signed-exchange',
  'dnt' => '1',
- }
-
+ }.freeze
+
  attr_accessor? :eat_cookie
  attr_accessor? :is_file
  attr_reader :kargs
@@ -49,80 +40,81 @@ module NHKore
  attr_accessor :redirect_rule
  attr_accessor :str_or_io
  attr_accessor :url
-
+
  # +max_redirects+ defaults to 3 for safety (infinite-loop attack).
- #
+ #
  # All URL options: https://ruby-doc.org/stdlib-2.7.0/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
- #
+ #
  # Pass in +header: {}+ for the default HTTP header fields to be set.
- #
+ #
  # @param eat_cookie [true,false] true to set the HTTP header field 'cookie', which can be an expensive
  # (time-consuming) operation since it opens the URL again, but necessary for some URLs.
  # @param redirect_rule [nil,:lenient,:strict]
- def initialize(url,eat_cookie: false,header: nil,is_file: false,max_redirects: 3,max_retries: 3,redirect_rule: :strict,str_or_io: nil,**kargs)
+ def initialize(url,eat_cookie: false,header: nil,is_file: false,max_redirects: 3,max_retries: 3,
+ redirect_rule: :strict,str_or_io: nil,**kargs)
  super()
-
- if !header.nil?() && !is_file
+
+ if !header.nil? && !is_file
  # Some sites (Search Engines) hate scrapers, so need HTTP header fields.
  # If this isn't enough, look at googler for more header fields to set:
  # - https://github.com/jarun/googler
  # If necessary, can use Faraday, HTTParty, or RestClient gem and
  # pass in to str_or_io.
-
+
  header = DEFAULT_HEADER.merge(header)
  kargs.merge!(header)
  end
-
+
  @eat_cookie = eat_cookie
  @is_file = is_file
  @kargs = kargs
  @max_redirects = max_redirects
  @max_retries = max_retries
  @redirect_rule = redirect_rule
-
- open(url,str_or_io,is_file: is_file)
+
+ self.open(url,str_or_io,is_file: is_file)
  end
-
+
  def fetch_cookie(url)
  require 'http-cookie'
-
+
  open_url(url)
-
+
  cookies = Array(@str_or_io.meta['set-cookie']) # nil will be []
-
- if !cookies.empty?()
- jar = HTTP::CookieJar.new()
+
+ if !cookies.empty?
+ jar = HTTP::CookieJar.new
  uri = URI(url)
-
- cookies.each() do |cookie|
+
+ cookies.each do |cookie|
  jar.parse(cookie,uri)
  end
-
+
  @kargs['cookie'] = HTTP::Cookie.cookie_value(jar.cookies(uri))
  end
-
+
  return self
  end
-
- def html_doc()
+
+ def html_doc
  return Nokogiri::HTML(@str_or_io)
  end
-
+
  def join_url(relative_url)
  # For a file, don't know what to do.
  # It would be unsafe to return something else;
  # for example, it could return a lot of "../../../" to your root dir.
  return nil if @is_file
-
- return URI::join(@url,relative_url)
+
+ return URI.join(@url,relative_url)
  end
-
+
  def open(url,str_or_io=nil,is_file: @is_file)
  @is_file = is_file
  @str_or_io = str_or_io
  @url = url
-
- if str_or_io.nil?()
+
+ if str_or_io.nil?
  if @is_file
  open_file(url)
  else
@@ -130,85 +122,86 @@ module NHKore
  open_url(url)
  end
  end
-
+
  return self
  end
-
+
  def open_file(file)
  @is_file = true
  @url = file
-
+
  # NHK's website tends to always use UTF-8.
  @str_or_io = File.open(file,'rt:UTF-8',**@kargs)
-
+
  return self
  end
-
+
  def open_url(url)
- max_redirects = (@max_redirects.nil?() || @max_redirects < 0) ? 10_000 : @max_redirects
- max_retries = (@max_retries.nil?() || @max_retries < 0) ? 10_000 : @max_retries
-
+ max_redirects = (@max_redirects.nil? || @max_redirects < 0) ? 10_000 : @max_redirects
+ max_retries = (@max_retries.nil? || @max_retries < 0) ? 10_000 : @max_retries
+
  top_uri = URI(url)
  top_domain = Util.domain(top_uri.host)
-
+
  begin
- # Use URI.open() instead of (Kernel.)open() for safety (code-injection attack).
- @str_or_io = URI.open(url,redirect: false,**@kargs)
+ # Use URI().open() instead of URI.open()/(Kernel.)open() for safety (code-injection attack).
+ # Use URI() instead of URI.parse() because url can be a URI (not just a string).
+ @str_or_io = URI(url).open(redirect: false,**@kargs)
  @url = url
  rescue OpenURI::HTTPRedirect => redirect
  redirect_uri = redirect.uri
-
+
  if (max_redirects -= 1) < 0
  raise redirect.exception("redirected to URL[#{redirect_uri}]: #{redirect}")
  end
-
+
  case @redirect_rule
  when :lenient,:strict
  if redirect_uri.scheme != top_uri.scheme
- raise redirect.exception("redirect scheme[#{redirect_uri.scheme}] does not match original " +
+ raise redirect.exception("redirect scheme[#{redirect_uri.scheme}] does not match original " \
  "scheme[#{top_uri.scheme}] at redirect URL[#{redirect_uri}]: #{redirect}")
  end
-
+
  if @redirect_rule == :strict
  redirect_domain = Util.domain(redirect_uri.host)
-
+
  if redirect_domain != top_domain
- raise redirect.exception("redirect domain[#{redirect_domain}] does not match original " +
+ raise redirect.exception("redirect domain[#{redirect_domain}] does not match original " \
  "domain[#{top_domain}] at redirect URL[#{redirect_uri}]: #{redirect}")
  end
  end
  end
-
+
  url = redirect_uri
-
+
  retry
  # Must come after HTTPRedirect since a subclass of HTTPError.
  rescue OpenURI::HTTPError => e
- raise e.exception("HTTP error[#{e.to_s()}] at URL[#{url}]")
+ raise e.exception("HTTP error[#{e}] at URL[#{url}]")
  rescue SocketError => e
  if (max_retries -= 1) < 0
- raise e.exception("Socket error[#{e.to_s()}] at URL[#{url}]")
+ raise e.exception("Socket error[#{e}] at URL[#{url}]")
  end
-
+
  retry
  end
-
+
  return self
  end
-
- def read()
- @str_or_io = @str_or_io.read() if @str_or_io.respond_to?(:read)
-
+
+ def read
+ @str_or_io = @str_or_io.read if @str_or_io.respond_to?(:read)
+
  return @str_or_io
  end
-
- def reopen()
- return open(@url)
+
+ def reopen
+ return self.open(@url)
  end
-
- def rss_doc()
+
+ def rss_doc
  require 'rss'
-
+
  return RSS::Parser.parse(@str_or_io,validate: false)
  end
  end
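
The Scraper changes above are mostly style cleanups (dropping empty parentheses, freezing DEFAULT_HEADER, adding extend AttrBool::Ext for the attr_accessor? declarations) plus the safer URI(url).open call, which also accepts URI objects. A minimal usage sketch, assuming the gem is installed, the usual nhkore/scraper require path, and a placeholder NHK URL:

  require 'nhkore/scraper'

  # Pass header: {} to send the DEFAULT_HEADER fields (user-agent, accept, dnt).
  scraper = NHKore::Scraper.new('https://www3.nhk.or.jp/news/easy/',header: {})
  doc = scraper.html_doc # Nokogiri::HTML document built from the fetched body

  puts doc.title

The hunks that follow cover a second file, the SearchLink and SearchLinks classes.
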
@@ -1,23 +1,11 @@
- #!/usr/bin/env ruby
  # encoding: UTF-8
  # frozen_string_literal: true

  #--
  # This file is part of NHKore.
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
- #
- # NHKore is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # NHKore is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public License
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
+ #
+ # SPDX-License-Identifier: LGPL-3.0-or-later
  #++


@@ -30,70 +18,88 @@ require 'nhkore/util'

  module NHKore
  ###
- # @author Jonathan Bradley Whited (@esotericpig)
+ # @author Jonathan Bradley Whited
  # @since 0.2.0
  ###
  class SearchLink
- attr_accessor :datetime
- attr_accessor :futsuurl
+ extend AttrBool::Ext
+
+ attr_reader :datetime
+ attr_reader :futsuurl
  attr_accessor? :scraped
  attr_accessor :sha256
  attr_accessor :title
- attr_accessor :url
-
+ attr_reader :url
+
  def initialize(url,scraped: false)
  super()
-
+
  @datetime = nil
  @futsuurl = nil
  @scraped = scraped
  @sha256 = sha256
  @title = nil
- @url = url
+ self.url = url
  end
-
+
  def encode_with(coder)
  # Order matters.
-
- coder[:url] = @url
+
+ coder[:url] = @url.nil? ? nil : @url.to_s
  coder[:scraped] = @scraped
- coder[:datetime] = @datetime.nil?() ? @datetime : @datetime.iso8601()
+ coder[:datetime] = @datetime.nil? ? nil : @datetime.iso8601
  coder[:title] = @title
- coder[:futsuurl] = @futsuurl
+ coder[:futsuurl] = @futsuurl.nil? ? nil : @futsuurl.to_s
  coder[:sha256] = @sha256
  end
-
+
  def self.load_data(key,hash)
- datetime = hash[:datetime]
-
  slink = SearchLink.new(
  hash[:url],
- scraped: hash[:scraped]
+ scraped: hash[:scraped],
  )
-
- slink.datetime = Util.empty_web_str?(datetime) ? nil : Time.iso8601(datetime)
+
+ slink.datetime = hash[:datetime]
  slink.futsuurl = hash[:futsuurl]
  slink.sha256 = hash[:sha256]
  slink.title = hash[:title]
-
+
  return slink
  end
-
+
  def update_from_article(article)
  # Don't update the url, as it may be different (e.g., http vs https).
-
- @datetime = article.datetime if @datetime.nil?()
- @futsuurl = article.futsuurl if Util.empty_web_str?(@futsuurl)
+
+ self.datetime = article.datetime if @datetime.nil?
+ self.futsuurl = article.futsuurl if Util.empty_web_str?(@futsuurl)
  @scraped = true # If we have an article, it's been scraped
  @sha256 = article.sha256 if Util.empty_web_str?(@sha256)
  @title = article.title if Util.empty_web_str?(@title)
  end
-
+
+ def datetime=(value)
+ if value.is_a?(Time)
+ @datetime = value
+ else
+ @datetime = Util.empty_web_str?(value) ? nil : Time.iso8601(value)
+ end
+ end
+
+ def futsuurl=(value)
+ # Don't store URI, store String.
+ @futsuurl = value.nil? ? nil : value.to_s
+ end
+
+ def url=(value)
+ # Don't store URI, store String.
+ @url = value.nil? ? nil : value.to_s
+ end
+
  def to_s(mini: false)
- s = ''.dup()
-
+ s = ''.dup
+
  s << "'#{@url}': "
-
+
  if mini
  s << "{ scraped? #{@scraped ? 'yes' : 'NO'} }"
  else
@@ -103,84 +109,85 @@ module NHKore
  s << "\n futsuurl: '#{@futsuurl}'"
  s << "\n sha256: '#{@sha256}'"
  end
-
+
  return s
  end
  end
-
+
  ###
- # @author Jonathan Bradley Whited (@esotericpig)
+ # @author Jonathan Bradley Whited
  # @since 0.2.0
  ###
  class SearchLinks
  include Fileable
-
+
  DEFAULT_DIR = Util::CORE_DIR
-
+
  DEFAULT_FUTSUU_FILENAME = 'links_nhk_news_web_regular.yml'
  DEFAULT_YASASHII_FILENAME = 'links_nhk_news_web_easy.yml'
-
+
  def self.build_file(filename)
  return File.join(DEFAULT_DIR,filename)
  end
-
+
  DEFAULT_FUTSUU_FILE = build_file(DEFAULT_FUTSUU_FILENAME)
  DEFAULT_YASASHII_FILE = build_file(DEFAULT_YASASHII_FILENAME)
-
+
  attr_reader :links
-
- def initialize()
+
+ def initialize
  super()
-
+
  @links = {}
  end
-
+
  def add_link(link)
- return self if @links.key?(link.url)
-
- @links[link.url] = link
-
+ url = link.url.nil? ? nil : link.url.to_s
+
+ return self if @links.key?(url)
+
+ @links[url] = link
+
  return self
  end
-
+
  def each(&block)
  return @links.each(&block)
  end
-
+
  def encode_with(coder)
  # Order matters.
-
+
  coder[:links] = @links
  end
-
+
  def self.load_data(data,file: nil,**kargs)
  data = Util.load_yaml(data,file: file)
-
+
  links = data[:links]
-
- slinks = SearchLinks.new()
-
- if !links.nil?()
- links.each() do |key,hash|
- key = key.to_s() # Change from a symbol
- slinks.links[key] = SearchLink.load_data(key,hash)
- end
+
+ slinks = SearchLinks.new
+
+ links&.each() do |key,hash|
+ key = key.to_s unless key.nil?
+ slinks.links[key] = SearchLink.load_data(key,hash)
  end
-
+
  return slinks
  end
-
+
  def [](url)
  url = url.url if url.respond_to?(:url)
-
+ url = url.to_s unless url.nil?
+
  return @links[url]
  end
-
- def length()
+
+ def length
  return @links.length
  end
-
- def to_s()
+
+ def to_s
  return Util.dump_yaml(self)
  end
  end
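
The net effect of the SearchLink/SearchLinks changes above is that url and futsuurl are now normalized to Strings by the new setters, datetime accepts either a Time or an ISO-8601 string, and SearchLinks keys its hash by the string URL. A short sketch of that behavior, assuming the usual nhkore/search_link require path and a hypothetical article URL:

  require 'nhkore/search_link'

  # A URI object is converted to a String by the url= setter.
  link = NHKore::SearchLink.new(URI('https://example.com/article.html'))
  link.datetime = '2021-01-01T00:00:00Z' # parsed into a Time by datetime=

  links = NHKore::SearchLinks.new
  links.add_link(link)

  puts link.url.class   # => String
  puts links.length     # => 1
  puts links[link.url].url # [] looks the link up by its string URL
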