nhkore 0.3.7 → 0.3.8

@@ -1,23 +1,11 @@
-#!/usr/bin/env ruby
 # encoding: UTF-8
 # frozen_string_literal: true

 #--
 # This file is part of NHKore.
-# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
-#
-# NHKore is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# NHKore is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public License
-# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+# Copyright (c) 2020-2021 Jonathan Bradley Whited
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
 #++


@@ -31,18 +19,19 @@ require 'nhkore/util'

 module NHKore
   ###
-  # @author Jonathan Bradley Whited (@esotericpig)
+  # @author Jonathan Bradley Whited
   # @since 0.2.0
   ###
   class Scraper
     extend AttrBool::Ext
-
+
     DEFAULT_HEADER = {
-      'user-agent' => UserAgents.sample(),
-      'accept' => 'text/html,application/xhtml+xml,application/xml,application/rss+xml,text/xml;image/webp,image/apng,*/*;application/signed-exchange',
+      'user-agent' => UserAgents.sample,
+      'accept' => 'text/html,application/xhtml+xml,application/xml,application/rss+xml,text/xml;image/webp' \
+                  ',image/apng,*/*;application/signed-exchange',
       'dnt' => '1',
-    }
-
+    }.freeze
+
     attr_accessor? :eat_cookie
     attr_accessor? :is_file
     attr_reader :kargs
@@ -51,80 +40,81 @@ module NHKore
     attr_accessor :redirect_rule
     attr_accessor :str_or_io
     attr_accessor :url
-
+
     # +max_redirects+ defaults to 3 for safety (infinite-loop attack).
-    #
+    #
     # All URL options: https://ruby-doc.org/stdlib-2.7.0/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
-    #
+    #
     # Pass in +header: {}+ for the default HTTP header fields to be set.
-    #
+    #
     # @param eat_cookie [true,false] true to set the HTTP header field 'cookie', which can be an expensive
     #   (time-consuming) operation since it opens the URL again, but necessary for some URLs.
     # @param redirect_rule [nil,:lenient,:strict]
-    def initialize(url,eat_cookie: false,header: nil,is_file: false,max_redirects: 3,max_retries: 3,redirect_rule: :strict,str_or_io: nil,**kargs)
+    def initialize(url,eat_cookie: false,header: nil,is_file: false,max_redirects: 3,max_retries: 3,
+                   redirect_rule: :strict,str_or_io: nil,**kargs)
       super()
-
-      if !header.nil?() && !is_file
+
+      if !header.nil? && !is_file
         # Some sites (Search Engines) hate scrapers, so need HTTP header fields.
         # If this isn't enough, look at googler for more header fields to set:
         # - https://github.com/jarun/googler
         # If necessary, can use Faraday, HTTParty, or RestClient gem and
         # pass in to str_or_io.
-
+
         header = DEFAULT_HEADER.merge(header)
         kargs.merge!(header)
       end
-
+
       @eat_cookie = eat_cookie
       @is_file = is_file
       @kargs = kargs
       @max_redirects = max_redirects
       @max_retries = max_retries
       @redirect_rule = redirect_rule
-
-      open(url,str_or_io,is_file: is_file)
+
+      self.open(url,str_or_io,is_file: is_file)
     end
-
+
     def fetch_cookie(url)
       require 'http-cookie'
-
+
       open_url(url)
-
+
       cookies = Array(@str_or_io.meta['set-cookie']) # nil will be []
-
-      if !cookies.empty?()
-        jar = HTTP::CookieJar.new()
+
+      if !cookies.empty?
+        jar = HTTP::CookieJar.new
         uri = URI(url)
-
-        cookies.each() do |cookie|
+
+        cookies.each do |cookie|
           jar.parse(cookie,uri)
         end
-
+
         @kargs['cookie'] = HTTP::Cookie.cookie_value(jar.cookies(uri))
       end
-
+
       return self
     end
-
-    def html_doc()
+
+    def html_doc
       return Nokogiri::HTML(@str_or_io)
     end
-
+
     def join_url(relative_url)
       # For a file, don't know what to do.
       # It would be unsafe to return something else;
       # for example, it could return a lot of "../../../" to your root dir.
       return nil if @is_file
-
-      return URI::join(@url,relative_url)
+
+      return URI.join(@url,relative_url)
     end
-
+
     def open(url,str_or_io=nil,is_file: @is_file)
       @is_file = is_file
       @str_or_io = str_or_io
       @url = url
-
-      if str_or_io.nil?()
+
+      if str_or_io.nil?
         if @is_file
           open_file(url)
         else
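
The constructor documented in the hunk above takes the URL plus keyword options (header, eat_cookie, max_redirects, max_retries, redirect_rule, str_or_io). A minimal usage sketch follows; the require path and the NHK URL are illustrative assumptions on my part, not part of this diff, and the rest mirrors the methods shown above:

  # Minimal sketch; fetches a live page when constructed.
  require 'nhkore/scraper'  # assumed require path for the gem

  scraper = NHKore::Scraper.new(
    'https://www3.nhk.or.jp/news/easy/',  # illustrative URL
    header: {},         # merged with DEFAULT_HEADER (user-agent, accept, dnt) in initialize
    eat_cookie: false,  # true would re-open the URL just to capture 'set-cookie'
    max_redirects: 3,   # redirect budget before OpenURI::HTTPRedirect is re-raised
  )

  doc = scraper.html_doc  # Nokogiri::HTML document parsed from the fetched body
  puts doc.title
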
@@ -132,85 +122,86 @@ module NHKore
           open_url(url)
         end
       end
-
+
       return self
     end
-
+
     def open_file(file)
       @is_file = true
       @url = file
-
+
       # NHK's website tends to always use UTF-8.
       @str_or_io = File.open(file,'rt:UTF-8',**@kargs)
-
+
       return self
     end
-
+
     def open_url(url)
-      max_redirects = (@max_redirects.nil?() || @max_redirects < 0) ? 10_000 : @max_redirects
-      max_retries = (@max_retries.nil?() || @max_retries < 0) ? 10_000 : @max_retries
-
+      max_redirects = (@max_redirects.nil? || @max_redirects < 0) ? 10_000 : @max_redirects
+      max_retries = (@max_retries.nil? || @max_retries < 0) ? 10_000 : @max_retries
+
       top_uri = URI(url)
       top_domain = Util.domain(top_uri.host)
-
+
       begin
-        # Use URI.open() instead of (Kernel.)open() for safety (code-injection attack).
-        @str_or_io = URI.open(url,redirect: false,**@kargs)
+        # Use URI().open() instead of URI.open()/(Kernel.)open() for safety (code-injection attack).
+        # Use URI() instead of URI.parse() because url can be a URI (not just a string).
+        @str_or_io = URI(url).open(redirect: false,**@kargs)
         @url = url
       rescue OpenURI::HTTPRedirect => redirect
         redirect_uri = redirect.uri
-
+
         if (max_redirects -= 1) < 0
           raise redirect.exception("redirected to URL[#{redirect_uri}]: #{redirect}")
         end
-
+
         case @redirect_rule
         when :lenient,:strict
           if redirect_uri.scheme != top_uri.scheme
-            raise redirect.exception("redirect scheme[#{redirect_uri.scheme}] does not match original " +
+            raise redirect.exception("redirect scheme[#{redirect_uri.scheme}] does not match original " \
              "scheme[#{top_uri.scheme}] at redirect URL[#{redirect_uri}]: #{redirect}")
           end
-
+
           if @redirect_rule == :strict
             redirect_domain = Util.domain(redirect_uri.host)
-
+
             if redirect_domain != top_domain
-              raise redirect.exception("redirect domain[#{redirect_domain}] does not match original " +
+              raise redirect.exception("redirect domain[#{redirect_domain}] does not match original " \
                "domain[#{top_domain}] at redirect URL[#{redirect_uri}]: #{redirect}")
             end
           end
         end
-
+
         url = redirect_uri
-
+
         retry
       # Must come after HTTPRedirect since a subclass of HTTPError.
       rescue OpenURI::HTTPError => e
-        raise e.exception("HTTP error[#{e.to_s()}] at URL[#{url}]")
+        raise e.exception("HTTP error[#{e}] at URL[#{url}]")
       rescue SocketError => e
         if (max_retries -= 1) < 0
-          raise e.exception("Socket error[#{e.to_s()}] at URL[#{url}]")
+          raise e.exception("Socket error[#{e}] at URL[#{url}]")
         end
-
+
         retry
       end
-
+
       return self
     end
-
-    def read()
-      @str_or_io = @str_or_io.read() if @str_or_io.respond_to?(:read)
-
+
+    def read
+      @str_or_io = @str_or_io.read if @str_or_io.respond_to?(:read)
+
       return @str_or_io
     end
-
-    def reopen()
-      return open(@url)
+
+    def reopen
+      return self.open(@url)
     end
-
-    def rss_doc()
+
+    def rss_doc
       require 'rss'
-
+
       return RSS::Parser.parse(@str_or_io,validate: false)
     end
   end
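
The open_url change above switches from URI.open to URI(url).open with redirect: false, so open-uri raises OpenURI::HTTPRedirect instead of following redirects silently, and the scraper checks the scheme and domain itself before retrying. A standalone sketch of that same pattern, with a hypothetical helper name and no nhkore dependencies (the scheme/domain vetting is left as a comment):

  require 'open-uri'

  # Hypothetical helper illustrating the redirect cap; not part of nhkore.
  def fetch_with_redirect_cap(url,max_redirects: 3)
    begin
      # redirect: false makes open-uri raise OpenURI::HTTPRedirect
      # instead of silently following the Location header.
      return URI(url).open(redirect: false)
    rescue OpenURI::HTTPRedirect => redirect
      raise redirect.exception("too many redirects at URL[#{redirect.uri}]") if (max_redirects -= 1) < 0

      # Vet redirect.uri here (scheme/domain), as Scraper does with its
      # :lenient/:strict redirect_rule, before trusting it.
      url = redirect.uri
      retry
    end
  end

The hunks that follow apply the same kind of cleanup (SPDX license header, dropped empty parentheses, trailing-whitespace removal) to the SearchLink and SearchLinks classes.
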
@@ -1,23 +1,11 @@
-#!/usr/bin/env ruby
 # encoding: UTF-8
 # frozen_string_literal: true

 #--
 # This file is part of NHKore.
-# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
-#
-# NHKore is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# NHKore is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public License
-# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+# Copyright (c) 2020-2021 Jonathan Bradley Whited
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
 #++


@@ -30,22 +18,22 @@ require 'nhkore/util'

 module NHKore
   ###
-  # @author Jonathan Bradley Whited (@esotericpig)
+  # @author Jonathan Bradley Whited
   # @since 0.2.0
   ###
   class SearchLink
     extend AttrBool::Ext
-
+
     attr_reader :datetime
     attr_reader :futsuurl
     attr_accessor? :scraped
     attr_accessor :sha256
     attr_accessor :title
     attr_reader :url
-
+
     def initialize(url,scraped: false)
       super()
-
+
       @datetime = nil
       @futsuurl = nil
       @scraped = scraped
@@ -53,42 +41,42 @@ module NHKore
       @title = nil
       self.url = url
     end
-
+
     def encode_with(coder)
       # Order matters.
-
-      coder[:url] = @url.nil?() ? nil : @url.to_s()
+
+      coder[:url] = @url.nil? ? nil : @url.to_s
       coder[:scraped] = @scraped
-      coder[:datetime] = @datetime.nil?() ? nil : @datetime.iso8601()
+      coder[:datetime] = @datetime.nil? ? nil : @datetime.iso8601
       coder[:title] = @title
-      coder[:futsuurl] = @futsuurl.nil?() ? nil : @futsuurl.to_s()
+      coder[:futsuurl] = @futsuurl.nil? ? nil : @futsuurl.to_s
       coder[:sha256] = @sha256
     end
-
+
     def self.load_data(key,hash)
       slink = SearchLink.new(
         hash[:url],
         scraped: hash[:scraped],
       )
-
+
       slink.datetime = hash[:datetime]
       slink.futsuurl = hash[:futsuurl]
       slink.sha256 = hash[:sha256]
       slink.title = hash[:title]
-
+
       return slink
     end
-
+
     def update_from_article(article)
       # Don't update the url, as it may be different (e.g., http vs https).
-
-      self.datetime = article.datetime if @datetime.nil?()
+
+      self.datetime = article.datetime if @datetime.nil?
       self.futsuurl = article.futsuurl if Util.empty_web_str?(@futsuurl)
       @scraped = true # If we have an article, it's been scraped
       @sha256 = article.sha256 if Util.empty_web_str?(@sha256)
       @title = article.title if Util.empty_web_str?(@title)
     end
-
+
     def datetime=(value)
       if value.is_a?(Time)
         @datetime = value
@@ -96,22 +84,22 @@ module NHKore
         @datetime = Util.empty_web_str?(value) ? nil : Time.iso8601(value)
       end
     end
-
+
     def futsuurl=(value)
       # Don't store URI, store String.
-      @futsuurl = value.nil?() ? nil : value.to_s()
+      @futsuurl = value.nil? ? nil : value.to_s
     end
-
+
     def url=(value)
       # Don't store URI, store String.
-      @url = value.nil?() ? nil : value.to_s()
+      @url = value.nil? ? nil : value.to_s
     end
-
+
     def to_s(mini: false)
-      s = ''.dup()
-
+      s = ''.dup
+
       s << "'#{@url}': "
-
+
       if mini
         s << "{ scraped? #{@scraped ? 'yes' : 'NO'} }"
       else
@@ -121,87 +109,85 @@ module NHKore
         s << "\n futsuurl: '#{@futsuurl}'"
         s << "\n sha256: '#{@sha256}'"
       end
-
+
       return s
     end
   end
-
+
   ###
-  # @author Jonathan Bradley Whited (@esotericpig)
+  # @author Jonathan Bradley Whited
   # @since 0.2.0
   ###
   class SearchLinks
     include Fileable
-
+
     DEFAULT_DIR = Util::CORE_DIR
-
+
     DEFAULT_FUTSUU_FILENAME = 'links_nhk_news_web_regular.yml'
     DEFAULT_YASASHII_FILENAME = 'links_nhk_news_web_easy.yml'
-
+
     def self.build_file(filename)
       return File.join(DEFAULT_DIR,filename)
     end
-
+
     DEFAULT_FUTSUU_FILE = build_file(DEFAULT_FUTSUU_FILENAME)
     DEFAULT_YASASHII_FILE = build_file(DEFAULT_YASASHII_FILENAME)
-
+
     attr_reader :links
-
-    def initialize()
+
+    def initialize
       super()
-
+
       @links = {}
     end
-
+
     def add_link(link)
-      url = link.url.nil?() ? nil : link.url.to_s()
-
+      url = link.url.nil? ? nil : link.url.to_s
+
       return self if @links.key?(url)
-
+
       @links[url] = link
-
+
       return self
     end
-
+
     def each(&block)
       return @links.each(&block)
     end
-
+
     def encode_with(coder)
       # Order matters.
-
+
       coder[:links] = @links
     end
-
+
     def self.load_data(data,file: nil,**kargs)
       data = Util.load_yaml(data,file: file)
-
+
       links = data[:links]
-
-      slinks = SearchLinks.new()
-
-      if !links.nil?()
-        links.each() do |key,hash|
-          key = key.to_s() unless key.nil?()
-          slinks.links[key] = SearchLink.load_data(key,hash)
-        end
+
+      slinks = SearchLinks.new
+
+      links&.each() do |key,hash|
+        key = key.to_s unless key.nil?
+        slinks.links[key] = SearchLink.load_data(key,hash)
       end
-
+
       return slinks
     end
-
+
     def [](url)
       url = url.url if url.respond_to?(:url)
-      url = url.to_s() unless url.nil?()
-
+      url = url.to_s unless url.nil?
+
       return @links[url]
     end
-
-    def length()
+
+    def length
       return @links.length
     end
-
-    def to_s()
+
+    def to_s
       return Util.dump_yaml(self)
     end
   end
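
SearchLinks keeps SearchLink objects in a Hash keyed by URL string and round-trips them through YAML via encode_with and load_data, which appears to be how the gem's links_nhk_news_web_*.yml files (named in the constants above) are written and read back. A small round-trip sketch; the require path and article URL are assumptions for illustration, and it presumes Util.dump_yaml/load_yaml round-trip a YAML string as they do for those files:

  require 'nhkore/search_link'  # assumed require path for the gem

  slinks = NHKore::SearchLinks.new
  slink = NHKore::SearchLink.new('https://www3.nhk.or.jp/news/easy/example/example.html')  # illustrative URL
  slink.title = 'Example title'

  slinks.add_link(slink)   # keyed by the URL string; a second add_link with the same URL is a no-op
  yaml = slinks.to_s       # Util.dump_yaml(self) -> YAML text via encode_with

  reloaded = NHKore::SearchLinks.load_data(yaml)
  puts reloaded.length                 # expect 1
  puts reloaded[slink.url].scraped?    # expect false (never marked as scraped)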