nhkore 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +35 -1
- data/README.md +305 -17
- data/Rakefile +10 -13
- data/lib/nhkore.rb +2 -1
- data/lib/nhkore/app.rb +66 -43
- data/lib/nhkore/article_scraper.rb +2 -2
- data/lib/nhkore/cli/fx_cmd.rb +1 -1
- data/lib/nhkore/cli/get_cmd.rb +27 -12
- data/lib/nhkore/cli/news_cmd.rb +19 -7
- data/lib/nhkore/cli/{bing_cmd.rb → search_cmd.rb} +125 -52
- data/lib/nhkore/scraper.rb +123 -59
- data/lib/nhkore/search_link.rb +4 -4
- data/lib/nhkore/search_scraper.rb +70 -15
- data/lib/nhkore/user_agents.rb +1179 -0
- data/lib/nhkore/util.rb +36 -1
- data/lib/nhkore/version.rb +1 -1
- data/nhkore.gemspec +30 -18
- metadata +22 -4
data/lib/nhkore/util.rb
CHANGED
@@ -24,7 +24,9 @@
|
|
24
24
|
require 'cgi'
|
25
25
|
require 'psychgus'
|
26
26
|
require 'public_suffix'
|
27
|
+
require 'set'
|
27
28
|
require 'time'
|
29
|
+
require 'uri'
|
28
30
|
|
29
31
|
|
30
32
|
module NHKore
|
@@ -69,7 +71,7 @@ module NHKore
|
|
69
71
|
|
70
72
|
def self.domain(host,clean: true)
|
71
73
|
domain = PublicSuffix.domain(host)
|
72
|
-
domain = unspace_web_str(domain).downcase() if clean
|
74
|
+
domain = unspace_web_str(domain).downcase() if !domain.nil?() && clean
|
73
75
|
|
74
76
|
return domain
|
75
77
|
end
|
@@ -164,6 +166,39 @@ module NHKore
|
|
164
166
|
return str.gsub(WEB_SPACES_REGEX,' ')
|
165
167
|
end
|
166
168
|
|
169
|
+
def self.replace_uri_query!(uri,**new_query)
|
170
|
+
return uri if new_query.empty?()
|
171
|
+
|
172
|
+
query = uri.query
|
173
|
+
query = query.nil?() ? [] : URI.decode_www_form(query)
|
174
|
+
|
175
|
+
# First, remove the old ones.
|
176
|
+
if !query.empty?()
|
177
|
+
new_query_keys = Set.new(new_query.keys.map() {|key|
|
178
|
+
unspace_web_str(key.to_s()).downcase()
|
179
|
+
})
|
180
|
+
|
181
|
+
query.filter!() do |q|
|
182
|
+
if q.nil?() || q.empty?()
|
183
|
+
false
|
184
|
+
else
|
185
|
+
key = unspace_web_str(q[0].to_s()).downcase()
|
186
|
+
|
187
|
+
!new_query_keys.include?(key)
|
188
|
+
end
|
189
|
+
end
|
190
|
+
end
|
191
|
+
|
192
|
+
# Next, add the new ones.
|
193
|
+
new_query.each() do |key,value|
|
194
|
+
query << [key,value.nil?() ? '' : value]
|
195
|
+
end
|
196
|
+
|
197
|
+
uri.query = URI.encode_www_form(query)
|
198
|
+
|
199
|
+
return uri
|
200
|
+
end
|
201
|
+
|
167
202
|
def self.sane_year?(year)
|
168
203
|
return year >= MIN_SANE_YEAR && year <= MAX_SANE_YEAR
|
169
204
|
end
|
data/lib/nhkore/version.rb
CHANGED
data/nhkore.gemspec
CHANGED
@@ -34,10 +34,9 @@ Gem::Specification.new() do |spec|
|
|
34
34
|
spec.licenses = ['LGPL-3.0-or-later']
|
35
35
|
spec.homepage = 'https://github.com/esotericpig/nhkore'
|
36
36
|
spec.summary = 'NHK News Web (Easy) word frequency (core) scraper for Japanese language learners.'
|
37
|
-
spec.description =
|
38
|
-
Scrapes NHK News Web (Easy) for the word frequency (core list) for Japanese language learners.
|
39
|
-
Includes a CLI app and a scraper library.
|
40
|
-
EOD
|
37
|
+
spec.description =
|
38
|
+
'Scrapes NHK News Web (Easy) for the word frequency (core list) for Japanese language learners.' \
|
39
|
+
' Includes a CLI app and a scraper library.'
|
41
40
|
|
42
41
|
spec.metadata = {
|
43
42
|
'bug_tracker_uri' => 'https://github.com/esotericpig/nhkore/issues',
|
@@ -60,19 +59,20 @@ Gem::Specification.new() do |spec|
|
|
60
59
|
|
61
60
|
spec.requirements << 'Nokogiri: https://www.nokogiri.org/tutorials/installing_nokogiri.html'
|
62
61
|
|
63
|
-
spec.add_runtime_dependency 'bimyou_segmenter'
|
64
|
-
spec.add_runtime_dependency 'cri'
|
65
|
-
spec.add_runtime_dependency 'down'
|
66
|
-
spec.add_runtime_dependency 'highline'
|
67
|
-
spec.add_runtime_dependency '
|
68
|
-
spec.add_runtime_dependency '
|
69
|
-
spec.add_runtime_dependency '
|
70
|
-
spec.add_runtime_dependency '
|
71
|
-
spec.add_runtime_dependency '
|
72
|
-
spec.add_runtime_dependency '
|
73
|
-
spec.add_runtime_dependency '
|
74
|
-
spec.add_runtime_dependency '
|
75
|
-
spec.add_runtime_dependency 'tty-
|
62
|
+
spec.add_runtime_dependency 'bimyou_segmenter' ,'~> 1.2' # For splitting Japanese sentences into words
|
63
|
+
spec.add_runtime_dependency 'cri' ,'~> 2.15' # For CLI commands/options
|
64
|
+
spec.add_runtime_dependency 'down' ,'~> 5.1' # For downloading files (GetCmd)
|
65
|
+
spec.add_runtime_dependency 'highline' ,'~> 2.0' # For CLI input/output
|
66
|
+
spec.add_runtime_dependency 'http-cookie' ,'~> 1.0' # For parsing/setting cookies (BingScraper/Scraper)
|
67
|
+
spec.add_runtime_dependency 'japanese_deinflector' ,'~> 0.0' # For unconjugating Japanese words (plain/dictionary form)
|
68
|
+
spec.add_runtime_dependency 'nokogiri' ,'~> 1.10' # For scraping/hacking
|
69
|
+
spec.add_runtime_dependency 'psychgus' ,'~> 1.2' # For styling Psych YAML
|
70
|
+
spec.add_runtime_dependency 'public_suffix' ,'~> 4.0' # For parsing URL domain names
|
71
|
+
spec.add_runtime_dependency 'rainbow' ,'~> 3.0' # For CLI color output
|
72
|
+
spec.add_runtime_dependency 'rubyzip' ,'~> 2.3' # For extracting Zip files (GetCmd)
|
73
|
+
spec.add_runtime_dependency 'tiny_segmenter' ,'~> 0.0' # For splitting Japanese sentences into words
|
74
|
+
spec.add_runtime_dependency 'tty-progressbar' ,'~> 0.17' # For CLI progress bars
|
75
|
+
spec.add_runtime_dependency 'tty-spinner' ,'~> 0.9' # For CLI spinning progress
|
76
76
|
|
77
77
|
spec.add_development_dependency 'bundler' ,'~> 2.1'
|
78
78
|
spec.add_development_dependency 'minitest' ,'~> 5.14'
|
@@ -83,5 +83,17 @@ Gem::Specification.new() do |spec|
|
|
83
83
|
spec.add_development_dependency 'yard' ,'~> 0.9' # For documentation
|
84
84
|
spec.add_development_dependency 'yard_ghurt','~> 1.2' # For extra YARDoc Rake tasks
|
85
85
|
|
86
|
-
spec.post_install_message =
|
86
|
+
spec.post_install_message = <<-EOM
|
87
|
+
|
88
|
+
NHKore v#{NHKore::VERSION}
|
89
|
+
|
90
|
+
You can now use [#{spec.executables.join(', ')}] on the command line.
|
91
|
+
|
92
|
+
Homepage: #{spec.homepage}
|
93
|
+
|
94
|
+
Code: #{spec.metadata['source_code_uri']}
|
95
|
+
Changelog: #{spec.metadata['changelog_uri']}
|
96
|
+
Bugs: #{spec.metadata['bug_tracker_uri']}
|
97
|
+
|
98
|
+
EOM
|
87
99
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nhkore
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Bradley Whited (@esotericpig)
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-04-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bimyou_segmenter
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '2.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: http-cookie
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: japanese_deinflector
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -324,10 +338,10 @@ files:
|
|
324
338
|
- lib/nhkore/article.rb
|
325
339
|
- lib/nhkore/article_scraper.rb
|
326
340
|
- lib/nhkore/cleaner.rb
|
327
|
-
- lib/nhkore/cli/bing_cmd.rb
|
328
341
|
- lib/nhkore/cli/fx_cmd.rb
|
329
342
|
- lib/nhkore/cli/get_cmd.rb
|
330
343
|
- lib/nhkore/cli/news_cmd.rb
|
344
|
+
- lib/nhkore/cli/search_cmd.rb
|
331
345
|
- lib/nhkore/cli/sift_cmd.rb
|
332
346
|
- lib/nhkore/defn.rb
|
333
347
|
- lib/nhkore/dict.rb
|
@@ -343,6 +357,7 @@ files:
|
|
343
357
|
- lib/nhkore/search_scraper.rb
|
344
358
|
- lib/nhkore/sifter.rb
|
345
359
|
- lib/nhkore/splitter.rb
|
360
|
+
- lib/nhkore/user_agents.rb
|
346
361
|
- lib/nhkore/util.rb
|
347
362
|
- lib/nhkore/variator.rb
|
348
363
|
- lib/nhkore/version.rb
|
@@ -359,7 +374,10 @@ metadata:
|
|
359
374
|
changelog_uri: https://github.com/esotericpig/nhkore/blob/master/CHANGELOG.md
|
360
375
|
homepage_uri: https://github.com/esotericpig/nhkore
|
361
376
|
source_code_uri: https://github.com/esotericpig/nhkore
|
362
|
-
post_install_message: You can now use [nhkore] on the command line.
|
377
|
+
post_install_message: " \n NHKore v0.3.0\n \n You can now use [nhkore] on the
|
378
|
+
command line.\n \n Homepage: https://github.com/esotericpig/nhkore\n \n Code:
|
379
|
+
\ https://github.com/esotericpig/nhkore\n Changelog: https://github.com/esotericpig/nhkore/blob/master/CHANGELOG.md\n
|
380
|
+
\ Bugs: https://github.com/esotericpig/nhkore/issues\n \n"
|
363
381
|
rdoc_options: []
|
364
382
|
require_paths:
|
365
383
|
- lib
|