nhkore 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +35 -1
- data/README.md +305 -17
- data/Rakefile +10 -13
- data/lib/nhkore.rb +2 -1
- data/lib/nhkore/app.rb +66 -43
- data/lib/nhkore/article_scraper.rb +2 -2
- data/lib/nhkore/cli/fx_cmd.rb +1 -1
- data/lib/nhkore/cli/get_cmd.rb +27 -12
- data/lib/nhkore/cli/news_cmd.rb +19 -7
- data/lib/nhkore/cli/{bing_cmd.rb → search_cmd.rb} +125 -52
- data/lib/nhkore/scraper.rb +123 -59
- data/lib/nhkore/search_link.rb +4 -4
- data/lib/nhkore/search_scraper.rb +70 -15
- data/lib/nhkore/user_agents.rb +1179 -0
- data/lib/nhkore/util.rb +36 -1
- data/lib/nhkore/version.rb +1 -1
- data/nhkore.gemspec +30 -18
- metadata +22 -4
data/lib/nhkore/util.rb
CHANGED
@@ -24,7 +24,9 @@
|
|
24
24
|
require 'cgi'
|
25
25
|
require 'psychgus'
|
26
26
|
require 'public_suffix'
|
27
|
+
require 'set'
|
27
28
|
require 'time'
|
29
|
+
require 'uri'
|
28
30
|
|
29
31
|
|
30
32
|
module NHKore
|
@@ -69,7 +71,7 @@ module NHKore
|
|
69
71
|
|
70
72
|
def self.domain(host,clean: true)
|
71
73
|
domain = PublicSuffix.domain(host)
|
72
|
-
domain = unspace_web_str(domain).downcase() if clean
|
74
|
+
domain = unspace_web_str(domain).downcase() if !domain.nil?() && clean
|
73
75
|
|
74
76
|
return domain
|
75
77
|
end
|
@@ -164,6 +166,39 @@ module NHKore
|
|
164
166
|
return str.gsub(WEB_SPACES_REGEX,' ')
|
165
167
|
end
|
166
168
|
|
169
|
+
def self.replace_uri_query!(uri,**new_query)
|
170
|
+
return uri if new_query.empty?()
|
171
|
+
|
172
|
+
query = uri.query
|
173
|
+
query = query.nil?() ? [] : URI.decode_www_form(query)
|
174
|
+
|
175
|
+
# First, remove the old ones.
|
176
|
+
if !query.empty?()
|
177
|
+
new_query_keys = Set.new(new_query.keys.map() {|key|
|
178
|
+
unspace_web_str(key.to_s()).downcase()
|
179
|
+
})
|
180
|
+
|
181
|
+
query.filter!() do |q|
|
182
|
+
if q.nil?() || q.empty?()
|
183
|
+
false
|
184
|
+
else
|
185
|
+
key = unspace_web_str(q[0].to_s()).downcase()
|
186
|
+
|
187
|
+
!new_query_keys.include?(key)
|
188
|
+
end
|
189
|
+
end
|
190
|
+
end
|
191
|
+
|
192
|
+
# Next, add the new ones.
|
193
|
+
new_query.each() do |key,value|
|
194
|
+
query << [key,value.nil?() ? '' : value]
|
195
|
+
end
|
196
|
+
|
197
|
+
uri.query = URI.encode_www_form(query)
|
198
|
+
|
199
|
+
return uri
|
200
|
+
end
|
201
|
+
|
167
202
|
def self.sane_year?(year)
|
168
203
|
return year >= MIN_SANE_YEAR && year <= MAX_SANE_YEAR
|
169
204
|
end
|
data/lib/nhkore/version.rb
CHANGED
data/nhkore.gemspec
CHANGED
@@ -34,10 +34,9 @@ Gem::Specification.new() do |spec|
|
|
34
34
|
spec.licenses = ['LGPL-3.0-or-later']
|
35
35
|
spec.homepage = 'https://github.com/esotericpig/nhkore'
|
36
36
|
spec.summary = 'NHK News Web (Easy) word frequency (core) scraper for Japanese language learners.'
|
37
|
-
spec.description = <<-EOD
|
38
|
-
Scrapes NHK News Web (Easy) for the word frequency (core list) for Japanese language learners.
|
39
|
-
Includes a CLI app and a scraper library.
|
40
|
-
EOD
|
37
|
+
spec.description =
|
38
|
+
'Scrapes NHK News Web (Easy) for the word frequency (core list) for Japanese language learners.' \
|
39
|
+
' Includes a CLI app and a scraper library.'
|
41
40
|
|
42
41
|
spec.metadata = {
|
43
42
|
'bug_tracker_uri' => 'https://github.com/esotericpig/nhkore/issues',
|
@@ -60,19 +59,20 @@ Gem::Specification.new() do |spec|
|
|
60
59
|
|
61
60
|
spec.requirements << 'Nokogiri: https://www.nokogiri.org/tutorials/installing_nokogiri.html'
|
62
61
|
|
63
|
-
spec.add_runtime_dependency 'bimyou_segmenter'
|
64
|
-
spec.add_runtime_dependency 'cri'
|
65
|
-
spec.add_runtime_dependency 'down'
|
66
|
-
spec.add_runtime_dependency 'highline'
|
67
|
-
spec.add_runtime_dependency '
|
68
|
-
spec.add_runtime_dependency '
|
69
|
-
spec.add_runtime_dependency '
|
70
|
-
spec.add_runtime_dependency '
|
71
|
-
spec.add_runtime_dependency '
|
72
|
-
spec.add_runtime_dependency '
|
73
|
-
spec.add_runtime_dependency '
|
74
|
-
spec.add_runtime_dependency '
|
75
|
-
spec.add_runtime_dependency 'tty-
|
62
|
+
spec.add_runtime_dependency 'bimyou_segmenter' ,'~> 1.2' # For splitting Japanese sentences into words
|
63
|
+
spec.add_runtime_dependency 'cri' ,'~> 2.15' # For CLI commands/options
|
64
|
+
spec.add_runtime_dependency 'down' ,'~> 5.1' # For downloading files (GetCmd)
|
65
|
+
spec.add_runtime_dependency 'highline' ,'~> 2.0' # For CLI input/output
|
66
|
+
spec.add_runtime_dependency 'http-cookie' ,'~> 1.0' # For parsing/setting cookies (BingScraper/Scraper)
|
67
|
+
spec.add_runtime_dependency 'japanese_deinflector' ,'~> 0.0' # For unconjugating Japanese words (plain/dictionary form)
|
68
|
+
spec.add_runtime_dependency 'nokogiri' ,'~> 1.10' # For scraping/hacking
|
69
|
+
spec.add_runtime_dependency 'psychgus' ,'~> 1.2' # For styling Psych YAML
|
70
|
+
spec.add_runtime_dependency 'public_suffix' ,'~> 4.0' # For parsing URL domain names
|
71
|
+
spec.add_runtime_dependency 'rainbow' ,'~> 3.0' # For CLI color output
|
72
|
+
spec.add_runtime_dependency 'rubyzip' ,'~> 2.3' # For extracting Zip files (GetCmd)
|
73
|
+
spec.add_runtime_dependency 'tiny_segmenter' ,'~> 0.0' # For splitting Japanese sentences into words
|
74
|
+
spec.add_runtime_dependency 'tty-progressbar' ,'~> 0.17' # For CLI progress bars
|
75
|
+
spec.add_runtime_dependency 'tty-spinner' ,'~> 0.9' # For CLI spinning progress
|
76
76
|
|
77
77
|
spec.add_development_dependency 'bundler' ,'~> 2.1'
|
78
78
|
spec.add_development_dependency 'minitest' ,'~> 5.14'
|
@@ -83,5 +83,17 @@ Gem::Specification.new() do |spec|
|
|
83
83
|
spec.add_development_dependency 'yard' ,'~> 0.9' # For documentation
|
84
84
|
spec.add_development_dependency 'yard_ghurt','~> 1.2' # For extra YARDoc Rake tasks
|
85
85
|
|
86
|
-
spec.post_install_message = "You can now use [#{spec.executables.join(', ')}] on the command line."
|
86
|
+
spec.post_install_message = <<-EOM
|
87
|
+
|
88
|
+
NHKore v#{NHKore::VERSION}
|
89
|
+
|
90
|
+
You can now use [#{spec.executables.join(', ')}] on the command line.
|
91
|
+
|
92
|
+
Homepage: #{spec.homepage}
|
93
|
+
|
94
|
+
Code: #{spec.metadata['source_code_uri']}
|
95
|
+
Changelog: #{spec.metadata['changelog_uri']}
|
96
|
+
Bugs: #{spec.metadata['bug_tracker_uri']}
|
97
|
+
|
98
|
+
EOM
|
87
99
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nhkore
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Bradley Whited (@esotericpig)
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-04-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bimyou_segmenter
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '2.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: http-cookie
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: japanese_deinflector
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -324,10 +338,10 @@ files:
|
|
324
338
|
- lib/nhkore/article.rb
|
325
339
|
- lib/nhkore/article_scraper.rb
|
326
340
|
- lib/nhkore/cleaner.rb
|
327
|
-
- lib/nhkore/cli/bing_cmd.rb
|
328
341
|
- lib/nhkore/cli/fx_cmd.rb
|
329
342
|
- lib/nhkore/cli/get_cmd.rb
|
330
343
|
- lib/nhkore/cli/news_cmd.rb
|
344
|
+
- lib/nhkore/cli/search_cmd.rb
|
331
345
|
- lib/nhkore/cli/sift_cmd.rb
|
332
346
|
- lib/nhkore/defn.rb
|
333
347
|
- lib/nhkore/dict.rb
|
@@ -343,6 +357,7 @@ files:
|
|
343
357
|
- lib/nhkore/search_scraper.rb
|
344
358
|
- lib/nhkore/sifter.rb
|
345
359
|
- lib/nhkore/splitter.rb
|
360
|
+
- lib/nhkore/user_agents.rb
|
346
361
|
- lib/nhkore/util.rb
|
347
362
|
- lib/nhkore/variator.rb
|
348
363
|
- lib/nhkore/version.rb
|
@@ -359,7 +374,10 @@ metadata:
|
|
359
374
|
changelog_uri: https://github.com/esotericpig/nhkore/blob/master/CHANGELOG.md
|
360
375
|
homepage_uri: https://github.com/esotericpig/nhkore
|
361
376
|
source_code_uri: https://github.com/esotericpig/nhkore
|
362
|
-
post_install_message: You can now use [nhkore] on the command line.
|
377
|
+
post_install_message: " \n NHKore v0.3.0\n \n You can now use [nhkore] on the
|
378
|
+
command line.\n \n Homepage: https://github.com/esotericpig/nhkore\n \n Code:
|
379
|
+
\ https://github.com/esotericpig/nhkore\n Changelog: https://github.com/esotericpig/nhkore/blob/master/CHANGELOG.md\n
|
380
|
+
\ Bugs: https://github.com/esotericpig/nhkore/issues\n \n"
|
363
381
|
rdoc_options: []
|
364
382
|
require_paths:
|
365
383
|
- lib
|