nhkore 0.3.7 → 0.3.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +53 -2
- data/Gemfile +0 -18
- data/Gemfile.lock +36 -33
- data/README.md +36 -30
- data/Rakefile +38 -52
- data/bin/nhkore +4 -15
- data/lib/nhkore/app.rb +235 -234
- data/lib/nhkore/article.rb +39 -53
- data/lib/nhkore/article_scraper.rb +293 -285
- data/lib/nhkore/cleaner.rb +20 -32
- data/lib/nhkore/cli/fx_cmd.rb +41 -53
- data/lib/nhkore/cli/get_cmd.rb +59 -70
- data/lib/nhkore/cli/news_cmd.rb +143 -153
- data/lib/nhkore/cli/search_cmd.rb +108 -118
- data/lib/nhkore/cli/sift_cmd.rb +109 -120
- data/lib/nhkore/datetime_parser.rb +88 -104
- data/lib/nhkore/defn.rb +48 -55
- data/lib/nhkore/dict.rb +26 -38
- data/lib/nhkore/dict_scraper.rb +31 -40
- data/lib/nhkore/entry.rb +43 -55
- data/lib/nhkore/error.rb +16 -21
- data/lib/nhkore/fileable.rb +10 -21
- data/lib/nhkore/lib.rb +5 -17
- data/lib/nhkore/missingno.rb +21 -33
- data/lib/nhkore/news.rb +58 -72
- data/lib/nhkore/polisher.rb +22 -34
- data/lib/nhkore/scraper.rb +74 -83
- data/lib/nhkore/search_link.rb +62 -76
- data/lib/nhkore/search_scraper.rb +81 -92
- data/lib/nhkore/sifter.rb +157 -171
- data/lib/nhkore/splitter.rb +19 -31
- data/lib/nhkore/user_agents.rb +28 -32
- data/lib/nhkore/util.rb +72 -84
- data/lib/nhkore/variator.rb +20 -32
- data/lib/nhkore/version.rb +4 -16
- data/lib/nhkore/word.rb +99 -97
- data/lib/nhkore.rb +8 -20
- data/nhkore.gemspec +30 -51
- data/samples/looper.rb +18 -29
- data/test/nhkore/test_helper.rb +3 -15
- data/test/nhkore_test.rb +6 -18
- metadata +33 -24
data/lib/nhkore/news.rb
CHANGED
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
# NHKore is free software: you can redistribute it and/or modify
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -29,155 +17,153 @@ require 'nhkore/util'
|
|
29
17
|
|
30
18
|
module NHKore
|
31
19
|
###
|
32
|
-
# @author Jonathan Bradley Whited
|
20
|
+
# @author Jonathan Bradley Whited
|
33
21
|
# @since 0.2.0
|
34
22
|
###
|
35
23
|
class News
|
36
24
|
include Fileable
|
37
|
-
|
25
|
+
|
38
26
|
DEFAULT_DIR = Util::CORE_DIR
|
39
|
-
FAVORED_URL = /https
|
40
|
-
|
27
|
+
FAVORED_URL = /https:/i.freeze
|
28
|
+
|
41
29
|
attr_reader :articles
|
42
30
|
attr_reader :sha256s
|
43
|
-
|
44
|
-
def initialize
|
31
|
+
|
32
|
+
def initialize
|
45
33
|
super()
|
46
|
-
|
34
|
+
|
47
35
|
@articles = {}
|
48
36
|
@sha256s = {}
|
49
37
|
end
|
50
|
-
|
38
|
+
|
51
39
|
def add_article(article,key: nil,overwrite: false)
|
52
40
|
url = article.url
|
53
|
-
url = url.to_s
|
54
|
-
|
55
|
-
key = key.nil?
|
56
|
-
|
41
|
+
url = url.to_s unless url.nil?
|
42
|
+
|
43
|
+
key = key.nil? ? url : key.to_s
|
44
|
+
|
57
45
|
if !overwrite
|
58
46
|
raise ArgumentError,"duplicate article[#{key}] in articles" if @articles.key?(key)
|
59
47
|
raise ArgumentError,"duplicate sha256[#{article.sha256}] in articles" if @sha256s.key?(article.sha256)
|
60
48
|
end
|
61
|
-
|
49
|
+
|
62
50
|
@articles[key] = article
|
63
51
|
@sha256s[article.sha256] = url
|
64
|
-
|
52
|
+
|
65
53
|
return self
|
66
54
|
end
|
67
|
-
|
55
|
+
|
68
56
|
def self.build_file(filename)
|
69
57
|
return File.join(DEFAULT_DIR,filename)
|
70
58
|
end
|
71
|
-
|
59
|
+
|
72
60
|
def encode_with(coder)
|
73
61
|
# Order matters.
|
74
62
|
# Don't output @sha256s.
|
75
|
-
|
63
|
+
|
76
64
|
coder[:articles] = @articles
|
77
65
|
end
|
78
|
-
|
66
|
+
|
79
67
|
def self.load_data(data,article_class: Article,file: nil,news_class: News,overwrite: false,**kargs)
|
80
68
|
data = Util.load_yaml(data,file: file)
|
81
|
-
|
69
|
+
|
82
70
|
articles = data[:articles]
|
83
|
-
|
84
|
-
news = news_class.new
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
news.add_article(article_class.load_data(key,hash),key: key,overwrite: overwrite)
|
90
|
-
end
|
71
|
+
|
72
|
+
news = news_class.new
|
73
|
+
|
74
|
+
articles&.each() do |key,hash|
|
75
|
+
key = key.to_s # Change from a symbol
|
76
|
+
news.add_article(article_class.load_data(key,hash),key: key,overwrite: overwrite)
|
91
77
|
end
|
92
|
-
|
78
|
+
|
93
79
|
return news
|
94
80
|
end
|
95
|
-
|
81
|
+
|
96
82
|
def update_article(article,url)
|
97
|
-
url = url.to_s
|
98
|
-
|
83
|
+
url = url.to_s unless url.nil?
|
84
|
+
|
99
85
|
# Favor https.
|
100
|
-
return if article.url.to_s
|
86
|
+
return if article.url.to_s =~ FAVORED_URL
|
101
87
|
return if url !~ FAVORED_URL
|
102
|
-
|
88
|
+
|
103
89
|
@articles.delete(article.url) # Probably no to_s() here
|
104
90
|
@articles[url] = article
|
105
91
|
article.url = url
|
106
92
|
end
|
107
|
-
|
93
|
+
|
108
94
|
def article(key)
|
109
|
-
key = key.to_s
|
110
|
-
|
95
|
+
key = key.to_s unless key.nil?
|
96
|
+
|
111
97
|
return @articles[key]
|
112
98
|
end
|
113
|
-
|
99
|
+
|
114
100
|
def article_with_sha256(sha256)
|
115
101
|
article = nil
|
116
|
-
|
117
|
-
@articles.
|
102
|
+
|
103
|
+
@articles.each_value do |a|
|
118
104
|
if a.sha256 == sha256
|
119
105
|
article = a
|
120
|
-
|
106
|
+
|
121
107
|
break
|
122
108
|
end
|
123
109
|
end
|
124
|
-
|
110
|
+
|
125
111
|
return article
|
126
112
|
end
|
127
|
-
|
113
|
+
|
128
114
|
def article?(key)
|
129
|
-
key = key.to_s
|
130
|
-
|
115
|
+
key = key.to_s unless key.nil?
|
116
|
+
|
131
117
|
return @articles.key?(key)
|
132
118
|
end
|
133
|
-
|
119
|
+
|
134
120
|
def sha256?(sha256)
|
135
121
|
return @sha256s.key?(sha256)
|
136
122
|
end
|
137
|
-
|
138
|
-
def to_s
|
123
|
+
|
124
|
+
def to_s
|
139
125
|
# Put each Word on one line (flow/inline style).
|
140
126
|
return Util.dump_yaml(self,flow_level: 8)
|
141
127
|
end
|
142
128
|
end
|
143
|
-
|
129
|
+
|
144
130
|
###
|
145
|
-
# @author Jonathan Bradley Whited
|
131
|
+
# @author Jonathan Bradley Whited
|
146
132
|
# @since 0.2.0
|
147
133
|
###
|
148
134
|
class FutsuuNews < News
|
149
135
|
DEFAULT_FILENAME = 'nhk_news_web_regular.yml'
|
150
136
|
DEFAULT_FILE = build_file(DEFAULT_FILENAME)
|
151
|
-
|
137
|
+
|
152
138
|
def self.load_data(data,**kargs)
|
153
139
|
return News.load_data(data,article_class: Article,news_class: FutsuuNews,**kargs)
|
154
140
|
end
|
155
|
-
|
141
|
+
|
156
142
|
def self.load_file(file=DEFAULT_FILE,**kargs)
|
157
143
|
return News.load_file(file,article_class: Article,news_class: FutsuuNews,**kargs)
|
158
144
|
end
|
159
|
-
|
145
|
+
|
160
146
|
def save_file(file=DEFAULT_FILE,**kargs)
|
161
147
|
super(file,**kargs)
|
162
148
|
end
|
163
149
|
end
|
164
|
-
|
150
|
+
|
165
151
|
###
|
166
|
-
# @author Jonathan Bradley Whited
|
152
|
+
# @author Jonathan Bradley Whited
|
167
153
|
# @since 0.2.0
|
168
154
|
###
|
169
155
|
class YasashiiNews < News
|
170
156
|
DEFAULT_FILENAME = 'nhk_news_web_easy.yml'
|
171
157
|
DEFAULT_FILE = build_file(DEFAULT_FILENAME)
|
172
|
-
|
158
|
+
|
173
159
|
def self.load_data(data,**kargs)
|
174
160
|
return News.load_data(data,article_class: Article,news_class: YasashiiNews,**kargs)
|
175
161
|
end
|
176
|
-
|
162
|
+
|
177
163
|
def self.load_file(file=DEFAULT_FILE,**kargs)
|
178
164
|
return News.load_file(file,article_class: Article,news_class: YasashiiNews,**kargs)
|
179
165
|
end
|
180
|
-
|
166
|
+
|
181
167
|
def save_file(file=DEFAULT_FILE,**kargs)
|
182
168
|
super(file,**kargs)
|
183
169
|
end
|
data/lib/nhkore/polisher.rb
CHANGED
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
# NHKore is free software: you can redistribute it and/or modify
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -26,28 +14,28 @@ require 'nhkore/word'
|
|
26
14
|
|
27
15
|
module NHKore
|
28
16
|
###
|
29
|
-
# @author Jonathan Bradley Whited
|
17
|
+
# @author Jonathan Bradley Whited
|
30
18
|
# @since 0.2.0
|
31
19
|
###
|
32
20
|
class Polisher
|
33
21
|
def begin_polish(str)
|
34
22
|
return str
|
35
23
|
end
|
36
|
-
|
24
|
+
|
37
25
|
def polish(str)
|
38
26
|
str = begin_polish(str)
|
39
27
|
str = end_polish(str)
|
40
|
-
|
28
|
+
|
41
29
|
return str
|
42
30
|
end
|
43
|
-
|
31
|
+
|
44
32
|
def self.polish_any(obj,polishers)
|
45
|
-
return nil if obj.nil?
|
46
|
-
|
33
|
+
return nil if obj.nil?
|
34
|
+
|
47
35
|
polishers = Array(polishers)
|
48
|
-
|
49
|
-
return obj if polishers.empty?
|
50
|
-
|
36
|
+
|
37
|
+
return obj if polishers.empty?
|
38
|
+
|
51
39
|
if obj.is_a?(Word)
|
52
40
|
obj = Word.new(
|
53
41
|
kana: polish_any(obj.kana,polishers),
|
@@ -55,17 +43,17 @@ module NHKore
|
|
55
43
|
word: obj
|
56
44
|
)
|
57
45
|
else # String
|
58
|
-
polishers.each
|
46
|
+
polishers.each do |polisher|
|
59
47
|
obj = polisher.polish(obj)
|
60
48
|
end
|
61
49
|
end
|
62
|
-
|
50
|
+
|
63
51
|
return obj
|
64
52
|
end
|
65
53
|
end
|
66
|
-
|
54
|
+
|
67
55
|
###
|
68
|
-
# @author Jonathan Bradley Whited
|
56
|
+
# @author Jonathan Bradley Whited
|
69
57
|
# @since 0.2.0
|
70
58
|
###
|
71
59
|
class BasicPolisher < Polisher
|
@@ -74,18 +62,18 @@ module NHKore
|
|
74
62
|
# - Yunibaasaru・Sutajio・Japan
|
75
63
|
# Keep numbers next to kanji/kana, else the below kana won't make sense:
|
76
64
|
# - Word { kanji: 20日, kana: はつか }
|
77
|
-
|
65
|
+
|
78
66
|
str = str.gsub(/[^[[:alnum:]]・]/,'')
|
79
|
-
|
67
|
+
|
80
68
|
# Numbers/dots by themselves (without kanji/kana) should be ignored (empty).
|
81
|
-
str = '' if str.gsub(/[[[:digit:]]・]+/,'').empty?
|
82
|
-
|
69
|
+
str = '' if str.gsub(/[[[:digit:]]・]+/,'').empty?
|
70
|
+
|
83
71
|
return str
|
84
72
|
end
|
85
73
|
end
|
86
|
-
|
74
|
+
|
87
75
|
###
|
88
|
-
# @author Jonathan Bradley Whited
|
76
|
+
# @author Jonathan Bradley Whited
|
89
77
|
# @since 0.2.0
|
90
78
|
###
|
91
79
|
class BestPolisher < BasicPolisher
|
data/lib/nhkore/scraper.rb
CHANGED
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
# NHKore is free software: you can redistribute it and/or modify
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -31,18 +19,19 @@ require 'nhkore/util'
|
|
31
19
|
|
32
20
|
module NHKore
|
33
21
|
###
|
34
|
-
# @author Jonathan Bradley Whited
|
22
|
+
# @author Jonathan Bradley Whited
|
35
23
|
# @since 0.2.0
|
36
24
|
###
|
37
25
|
class Scraper
|
38
26
|
extend AttrBool::Ext
|
39
|
-
|
27
|
+
|
40
28
|
DEFAULT_HEADER = {
|
41
|
-
'user-agent' => UserAgents.sample
|
42
|
-
'accept' => 'text/html,application/xhtml+xml,application/xml,application/rss+xml,text/xml;image/webp
|
29
|
+
'user-agent' => UserAgents.sample,
|
30
|
+
'accept' => 'text/html,application/xhtml+xml,application/xml,application/rss+xml,text/xml;image/webp' \
|
31
|
+
',image/apng,*/*;application/signed-exchange',
|
43
32
|
'dnt' => '1',
|
44
|
-
}
|
45
|
-
|
33
|
+
}.freeze
|
34
|
+
|
46
35
|
attr_accessor? :eat_cookie
|
47
36
|
attr_accessor? :is_file
|
48
37
|
attr_reader :kargs
|
@@ -51,80 +40,81 @@ module NHKore
|
|
51
40
|
attr_accessor :redirect_rule
|
52
41
|
attr_accessor :str_or_io
|
53
42
|
attr_accessor :url
|
54
|
-
|
43
|
+
|
55
44
|
# +max_redirects+ defaults to 3 for safety (infinite-loop attack).
|
56
|
-
#
|
45
|
+
#
|
57
46
|
# All URL options: https://ruby-doc.org/stdlib-2.7.0/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
|
58
|
-
#
|
47
|
+
#
|
59
48
|
# Pass in +header: {}+ for the default HTTP header fields to be set.
|
60
|
-
#
|
49
|
+
#
|
61
50
|
# @param eat_cookie [true,false] true to set the HTTP header field 'cookie', which can be an expensive
|
62
51
|
# (time-consuming) operation since it opens the URL again, but necessary for some URLs.
|
63
52
|
# @param redirect_rule [nil,:lenient,:strict]
|
64
|
-
def initialize(url,eat_cookie: false,header: nil,is_file: false,max_redirects: 3,max_retries: 3,
|
53
|
+
def initialize(url,eat_cookie: false,header: nil,is_file: false,max_redirects: 3,max_retries: 3,
|
54
|
+
redirect_rule: :strict,str_or_io: nil,**kargs)
|
65
55
|
super()
|
66
|
-
|
67
|
-
if !header.nil?
|
56
|
+
|
57
|
+
if !header.nil? && !is_file
|
68
58
|
# Some sites (Search Engines) hate scrapers, so need HTTP header fields.
|
69
59
|
# If this isn't enough, look at googler for more header fields to set:
|
70
60
|
# - https://github.com/jarun/googler
|
71
61
|
# If necessary, can use Faraday, HTTParty, or RestClient gem and
|
72
62
|
# pass in to str_or_io.
|
73
|
-
|
63
|
+
|
74
64
|
header = DEFAULT_HEADER.merge(header)
|
75
65
|
kargs.merge!(header)
|
76
66
|
end
|
77
|
-
|
67
|
+
|
78
68
|
@eat_cookie = eat_cookie
|
79
69
|
@is_file = is_file
|
80
70
|
@kargs = kargs
|
81
71
|
@max_redirects = max_redirects
|
82
72
|
@max_retries = max_retries
|
83
73
|
@redirect_rule = redirect_rule
|
84
|
-
|
85
|
-
open(url,str_or_io,is_file: is_file)
|
74
|
+
|
75
|
+
self.open(url,str_or_io,is_file: is_file)
|
86
76
|
end
|
87
|
-
|
77
|
+
|
88
78
|
def fetch_cookie(url)
|
89
79
|
require 'http-cookie'
|
90
|
-
|
80
|
+
|
91
81
|
open_url(url)
|
92
|
-
|
82
|
+
|
93
83
|
cookies = Array(@str_or_io.meta['set-cookie']) # nil will be []
|
94
|
-
|
95
|
-
if !cookies.empty?
|
96
|
-
jar = HTTP::CookieJar.new
|
84
|
+
|
85
|
+
if !cookies.empty?
|
86
|
+
jar = HTTP::CookieJar.new
|
97
87
|
uri = URI(url)
|
98
|
-
|
99
|
-
cookies.each
|
88
|
+
|
89
|
+
cookies.each do |cookie|
|
100
90
|
jar.parse(cookie,uri)
|
101
91
|
end
|
102
|
-
|
92
|
+
|
103
93
|
@kargs['cookie'] = HTTP::Cookie.cookie_value(jar.cookies(uri))
|
104
94
|
end
|
105
|
-
|
95
|
+
|
106
96
|
return self
|
107
97
|
end
|
108
|
-
|
109
|
-
def html_doc
|
98
|
+
|
99
|
+
def html_doc
|
110
100
|
return Nokogiri::HTML(@str_or_io)
|
111
101
|
end
|
112
|
-
|
102
|
+
|
113
103
|
def join_url(relative_url)
|
114
104
|
# For a file, don't know what to do.
|
115
105
|
# It would be unsafe to return something else;
|
116
106
|
# for example, it could return a lot of "../../../" to your root dir.
|
117
107
|
return nil if @is_file
|
118
|
-
|
119
|
-
return URI
|
108
|
+
|
109
|
+
return URI.join(@url,relative_url)
|
120
110
|
end
|
121
|
-
|
111
|
+
|
122
112
|
def open(url,str_or_io=nil,is_file: @is_file)
|
123
113
|
@is_file = is_file
|
124
114
|
@str_or_io = str_or_io
|
125
115
|
@url = url
|
126
|
-
|
127
|
-
if str_or_io.nil?
|
116
|
+
|
117
|
+
if str_or_io.nil?
|
128
118
|
if @is_file
|
129
119
|
open_file(url)
|
130
120
|
else
|
@@ -132,85 +122,86 @@ module NHKore
|
|
132
122
|
open_url(url)
|
133
123
|
end
|
134
124
|
end
|
135
|
-
|
125
|
+
|
136
126
|
return self
|
137
127
|
end
|
138
|
-
|
128
|
+
|
139
129
|
def open_file(file)
|
140
130
|
@is_file = true
|
141
131
|
@url = file
|
142
|
-
|
132
|
+
|
143
133
|
# NHK's website tends to always use UTF-8.
|
144
134
|
@str_or_io = File.open(file,'rt:UTF-8',**@kargs)
|
145
|
-
|
135
|
+
|
146
136
|
return self
|
147
137
|
end
|
148
|
-
|
138
|
+
|
149
139
|
def open_url(url)
|
150
|
-
max_redirects = (@max_redirects.nil?
|
151
|
-
max_retries = (@max_retries.nil?
|
152
|
-
|
140
|
+
max_redirects = (@max_redirects.nil? || @max_redirects < 0) ? 10_000 : @max_redirects
|
141
|
+
max_retries = (@max_retries.nil? || @max_retries < 0) ? 10_000 : @max_retries
|
142
|
+
|
153
143
|
top_uri = URI(url)
|
154
144
|
top_domain = Util.domain(top_uri.host)
|
155
|
-
|
145
|
+
|
156
146
|
begin
|
157
|
-
# Use URI.open() instead of (Kernel.)open() for safety (code-injection attack).
|
158
|
-
|
147
|
+
# Use URI().open() instead of URI.open()/(Kernel.)open() for safety (code-injection attack).
|
148
|
+
# Use URI() instead of URI.parse() because url can be a URI (not just a string).
|
149
|
+
@str_or_io = URI(url).open(redirect: false,**@kargs)
|
159
150
|
@url = url
|
160
151
|
rescue OpenURI::HTTPRedirect => redirect
|
161
152
|
redirect_uri = redirect.uri
|
162
|
-
|
153
|
+
|
163
154
|
if (max_redirects -= 1) < 0
|
164
155
|
raise redirect.exception("redirected to URL[#{redirect_uri}]: #{redirect}")
|
165
156
|
end
|
166
|
-
|
157
|
+
|
167
158
|
case @redirect_rule
|
168
159
|
when :lenient,:strict
|
169
160
|
if redirect_uri.scheme != top_uri.scheme
|
170
|
-
raise redirect.exception("redirect scheme[#{redirect_uri.scheme}] does not match original "
|
161
|
+
raise redirect.exception("redirect scheme[#{redirect_uri.scheme}] does not match original " \
|
171
162
|
"scheme[#{top_uri.scheme}] at redirect URL[#{redirect_uri}]: #{redirect}")
|
172
163
|
end
|
173
|
-
|
164
|
+
|
174
165
|
if @redirect_rule == :strict
|
175
166
|
redirect_domain = Util.domain(redirect_uri.host)
|
176
|
-
|
167
|
+
|
177
168
|
if redirect_domain != top_domain
|
178
|
-
raise redirect.exception("redirect domain[#{redirect_domain}] does not match original "
|
169
|
+
raise redirect.exception("redirect domain[#{redirect_domain}] does not match original " \
|
179
170
|
"domain[#{top_domain}] at redirect URL[#{redirect_uri}]: #{redirect}")
|
180
171
|
end
|
181
172
|
end
|
182
173
|
end
|
183
|
-
|
174
|
+
|
184
175
|
url = redirect_uri
|
185
|
-
|
176
|
+
|
186
177
|
retry
|
187
178
|
# Must come after HTTPRedirect since a subclass of HTTPError.
|
188
179
|
rescue OpenURI::HTTPError => e
|
189
|
-
raise e.exception("HTTP error[#{e
|
180
|
+
raise e.exception("HTTP error[#{e}] at URL[#{url}]")
|
190
181
|
rescue SocketError => e
|
191
182
|
if (max_retries -= 1) < 0
|
192
|
-
raise e.exception("Socket error[#{e
|
183
|
+
raise e.exception("Socket error[#{e}] at URL[#{url}]")
|
193
184
|
end
|
194
|
-
|
185
|
+
|
195
186
|
retry
|
196
187
|
end
|
197
|
-
|
188
|
+
|
198
189
|
return self
|
199
190
|
end
|
200
|
-
|
201
|
-
def read
|
202
|
-
@str_or_io = @str_or_io.read
|
203
|
-
|
191
|
+
|
192
|
+
def read
|
193
|
+
@str_or_io = @str_or_io.read if @str_or_io.respond_to?(:read)
|
194
|
+
|
204
195
|
return @str_or_io
|
205
196
|
end
|
206
|
-
|
207
|
-
def reopen
|
208
|
-
return open(@url)
|
197
|
+
|
198
|
+
def reopen
|
199
|
+
return self.open(@url)
|
209
200
|
end
|
210
|
-
|
211
|
-
def rss_doc
|
201
|
+
|
202
|
+
def rss_doc
|
212
203
|
require 'rss'
|
213
|
-
|
204
|
+
|
214
205
|
return RSS::Parser.parse(@str_or_io,validate: false)
|
215
206
|
end
|
216
207
|
end
|