nhkore 0.3.19 → 0.3.22
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -5
- data/Gemfile.lock +2 -4
- data/README.md +6 -1
- data/lib/nhkore/cli/get_cmd.rb +5 -5
- data/lib/nhkore/lib.rb +1 -0
- data/lib/nhkore/scraper.rb +2 -2
- data/lib/nhkore/search_scraper.rb +6 -8
- data/lib/nhkore/sifter.rb +3 -3
- data/lib/nhkore/user_agents.rb +1171 -0
- data/lib/nhkore/version.rb +1 -1
- data/nhkore.gemspec +1 -2
- metadata +7 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6d6942febe79f05d1cbd53e40f9048671ee0462d16e22945a6b7bdb3bb5bb2ae
|
4
|
+
data.tar.gz: 226acf4e93e6b95b475a1d42bc7a0d2dff82d5647613164fc2feba882e24b6c7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 40d4b5513b9a3d22e4969ef7d88c477d4c1dc118165660e751c8c9d742305d74470494597e5e12f61e15af9dc4c3f4e0bccdd82ac7e8488cb2a312a9749ada20
|
7
|
+
data.tar.gz: 00307ecff6363a7f0b80595fc7ec14a01f3e9e303d5d3023e084883a075bed12464a388ca16b6ff6ca143e993bdb5834493d361084452dd61846567173f0ef7f
|
data/CHANGELOG.md
CHANGED
@@ -1,12 +1,18 @@
|
|
1
1
|
# Changelog | NHKore
|
2
2
|
|
3
|
-
|
3
|
+
- [Keep a Changelog v1.0.0](https://keepachangelog.com/en/1.0.0)
|
4
|
+
- [Semantic Versioning v2.0.0](https://semver.org/spec/v2.0.0.html)
|
4
5
|
|
5
|
-
|
6
|
-
|
6
|
+
## [Unreleased]
|
7
|
+
- https://github.com/esotericpig/nhkore/compare/v0.3.22...v0.3
|
7
8
|
|
8
|
-
|
9
|
-
-
|
9
|
+
|
10
|
+
## [v0.3.22] - 2025-04-30
|
11
|
+
|
12
|
+
### Changed
|
13
|
+
- Put v0.3 in its own branch to prepare for v0.4, which will heavily change.
|
14
|
+
- Changed v0.3 links to use v0.3 branch.
|
15
|
+
- Reverted the removing of `UserAgents` for v0.3 only.
|
10
16
|
|
11
17
|
|
12
18
|
## [v0.3.19] - 2025-04-28
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
nhkore (0.3.
|
4
|
+
nhkore (0.3.22)
|
5
5
|
attr_bool (~> 0.2)
|
6
6
|
bimyou_segmenter (~> 1.2)
|
7
7
|
cri (~> 2.15)
|
@@ -14,7 +14,6 @@ PATH
|
|
14
14
|
psychgus (~> 1.3)
|
15
15
|
public_suffix (~> 6.0)
|
16
16
|
rainbow (~> 3.1)
|
17
|
-
ronin-web-user_agents (~> 0.1)
|
18
17
|
rss (~> 0.3)
|
19
18
|
rubyzip (~> 2.4)
|
20
19
|
tiny_segmenter (~> 0.0)
|
@@ -50,7 +49,7 @@ GEM
|
|
50
49
|
stringio
|
51
50
|
psychgus (1.3.5)
|
52
51
|
psych (>= 3.0)
|
53
|
-
public_suffix (6.0.
|
52
|
+
public_suffix (6.0.2)
|
54
53
|
racc (1.8.1)
|
55
54
|
rainbow (3.1.1)
|
56
55
|
rake (13.2.1)
|
@@ -62,7 +61,6 @@ GEM
|
|
62
61
|
reline (0.6.1)
|
63
62
|
io-console (~> 0.5)
|
64
63
|
rexml (3.4.1)
|
65
|
-
ronin-web-user_agents (0.1.1)
|
66
64
|
rss (0.3.1)
|
67
65
|
rexml
|
68
66
|
rubyzip (2.4.1)
|
data/README.md
CHANGED
@@ -732,7 +732,7 @@ if !File.exist?(file)
|
|
732
732
|
end
|
733
733
|
```
|
734
734
|
|
735
|
-
### Util & DatetimeParser
|
735
|
+
### Util, UserAgents, & DatetimeParser
|
736
736
|
|
737
737
|
These provide a variety of useful methods/constants.
|
738
738
|
|
@@ -740,6 +740,7 @@ Here are some of the most useful ones:
|
|
740
740
|
|
741
741
|
```Ruby
|
742
742
|
require 'nhkore/datetime_parser'
|
743
|
+
require 'nhkore/user_agents'
|
743
744
|
require 'nhkore/util'
|
744
745
|
|
745
746
|
include NHKore
|
@@ -747,6 +748,10 @@ include NHKore
|
|
747
748
|
puts '======='
|
748
749
|
puts '[ Net ]'
|
749
750
|
puts '======='
|
751
|
+
# Get a random User Agent for HTTP header field 'User-Agent'.
|
752
|
+
# - This is used by default in Scraper/SearchScraper.
|
753
|
+
puts "User-Agent: #{UserAgents.sample()}"
|
754
|
+
|
750
755
|
uri = URI('https://www.bing.com/search?q=nhk')
|
751
756
|
Util.replace_uri_query!(uri,q: 'banana')
|
752
757
|
|
data/lib/nhkore/cli/get_cmd.rb
CHANGED
@@ -9,6 +9,7 @@
|
|
9
9
|
#++
|
10
10
|
|
11
11
|
require 'nhkore/util'
|
12
|
+
require 'nhkore/version'
|
12
13
|
|
13
14
|
module NHKore
|
14
15
|
module CLI
|
@@ -16,7 +17,8 @@ module CLI
|
|
16
17
|
DEFAULT_GET_CHUNK_SIZE = 4 * 1024
|
17
18
|
DEFAULT_GET_URL_LENGTH = 11_000_000 # Just a generous estimation used as a fallback; may be outdated.
|
18
19
|
GET_URL_FILENAME = 'nhkore-core.zip'
|
19
|
-
GET_URL = "https://github.com/esotericpig/nhkore/releases/
|
20
|
+
GET_URL = "https://github.com/esotericpig/nhkore/releases/download/v#{NHKore::VERSION}" \
|
21
|
+
"/#{GET_URL_FILENAME}".freeze
|
20
22
|
|
21
23
|
def build_get_cmd
|
22
24
|
app = self
|
@@ -36,9 +38,7 @@ module CLI
|
|
36
38
|
DESC
|
37
39
|
|
38
40
|
option :o,:out,'directory to save downloaded files to',argument: :required,default: Util::CORE_DIR,
|
39
|
-
|
40
|
-
app.check_empty_opt(:out,value)
|
41
|
-
}
|
41
|
+
transform: ->(value) { app.check_empty_opt(:out,value) }
|
42
42
|
flag nil,:'show-url','show download URL and exit (for downloading manually)' do |_value,_cmd|
|
43
43
|
puts GET_URL
|
44
44
|
exit
|
@@ -69,7 +69,7 @@ module CLI
|
|
69
69
|
out_dir = @cmd_opts[:out]
|
70
70
|
|
71
71
|
begin
|
72
|
-
start_spin(
|
72
|
+
start_spin("Opening URL: #{GET_URL} ")
|
73
73
|
|
74
74
|
begin
|
75
75
|
down = Down::NetHttp.open(GET_URL,rewindable: false,**@scraper_kargs)
|
data/lib/nhkore/lib.rb
CHANGED
data/lib/nhkore/scraper.rb
CHANGED
@@ -11,9 +11,9 @@
|
|
11
11
|
require 'attr_bool'
|
12
12
|
require 'nokogiri'
|
13
13
|
require 'open-uri'
|
14
|
-
require 'ronin/web/user_agents'
|
15
14
|
|
16
15
|
require 'nhkore/error'
|
16
|
+
require 'nhkore/user_agents'
|
17
17
|
require 'nhkore/util'
|
18
18
|
|
19
19
|
module NHKore
|
@@ -23,7 +23,7 @@ module NHKore
|
|
23
23
|
DEFAULT_HEADER = {
|
24
24
|
# See for better ones:
|
25
25
|
# - https://www.useragentstring.com/pages/Chrome/
|
26
|
-
'user-agent' =>
|
26
|
+
'user-agent' => UserAgents.sample,
|
27
27
|
|
28
28
|
'accept' => 'text/html,application/xhtml+xml,application/xml,application/rss+xml,text/xml;' \
|
29
29
|
'q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
@@ -111,14 +111,14 @@ module NHKore
|
|
111
111
|
super(url,**kargs)
|
112
112
|
end
|
113
113
|
|
114
|
-
# FIXME: Bing no longer allows `count`.
|
115
|
-
# rubocop:disable Lint/UnusedMethodArgument
|
116
114
|
def self.build_url(site,count: DEFAULT_RESULT_COUNT,**_kargs)
|
117
115
|
url = ''.dup
|
118
116
|
|
119
117
|
url << 'https://www.bing.com/search?'
|
120
118
|
url << URI.encode_www_form(
|
121
119
|
q: "site:#{site}",
|
120
|
+
count: count,
|
121
|
+
|
122
122
|
qs: 'n',
|
123
123
|
sp: '-1',
|
124
124
|
lq: '0',
|
@@ -131,15 +131,13 @@ module NHKore
|
|
131
131
|
|
132
132
|
return url
|
133
133
|
end
|
134
|
-
# rubocop:enable Lint/UnusedMethodArgument
|
135
134
|
|
136
135
|
def scrape(slinks,page = NextPage.new())
|
137
|
-
next_page,
|
136
|
+
next_page,link_count = scrape_html(slinks,page)
|
138
137
|
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
# end
|
138
|
+
if link_count <= 0
|
139
|
+
scrape_rss(slinks,page,next_page)
|
140
|
+
end
|
143
141
|
|
144
142
|
return next_page
|
145
143
|
end
|
data/lib/nhkore/sifter.rb
CHANGED
@@ -322,7 +322,7 @@ module NHKore
|
|
322
322
|
end
|
323
323
|
|
324
324
|
def sift
|
325
|
-
|
325
|
+
result = Article.new
|
326
326
|
|
327
327
|
@articles.each do |article|
|
328
328
|
next if filter?(article)
|
@@ -333,11 +333,11 @@ module NHKore
|
|
333
333
|
next if word.freq <= 1
|
334
334
|
next if word.word =~ /\p{Latin}|[[:digit:]]/
|
335
335
|
|
336
|
-
|
336
|
+
result.add_word(word,use_freq: true)
|
337
337
|
end
|
338
338
|
end
|
339
339
|
|
340
|
-
words =
|
340
|
+
words = result.words.values
|
341
341
|
|
342
342
|
words.sort! do |word1,word2|
|
343
343
|
# Order by freq DESC (most frequent words to top).
|