nhkore 0.3.3 → 0.3.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +97 -2
- data/Gemfile +0 -18
- data/Gemfile.lock +89 -0
- data/README.md +58 -30
- data/Rakefile +68 -42
- data/bin/nhkore +4 -15
- data/lib/nhkore.rb +8 -20
- data/lib/nhkore/app.rb +231 -236
- data/lib/nhkore/article.rb +56 -53
- data/lib/nhkore/article_scraper.rb +308 -289
- data/lib/nhkore/cleaner.rb +20 -32
- data/lib/nhkore/cli/fx_cmd.rb +41 -53
- data/lib/nhkore/cli/get_cmd.rb +59 -70
- data/lib/nhkore/cli/news_cmd.rb +145 -154
- data/lib/nhkore/cli/search_cmd.rb +110 -120
- data/lib/nhkore/cli/sift_cmd.rb +111 -227
- data/lib/nhkore/datetime_parser.rb +328 -0
- data/lib/nhkore/defn.rb +48 -55
- data/lib/nhkore/dict.rb +26 -38
- data/lib/nhkore/dict_scraper.rb +31 -40
- data/lib/nhkore/entry.rb +43 -55
- data/lib/nhkore/error.rb +16 -21
- data/lib/nhkore/fileable.rb +10 -21
- data/lib/nhkore/lib.rb +6 -17
- data/lib/nhkore/missingno.rb +21 -33
- data/lib/nhkore/news.rb +61 -66
- data/lib/nhkore/polisher.rb +22 -34
- data/lib/nhkore/scraper.rb +75 -82
- data/lib/nhkore/search_link.rb +85 -78
- data/lib/nhkore/search_scraper.rb +89 -92
- data/lib/nhkore/sifter.rb +157 -171
- data/lib/nhkore/splitter.rb +19 -31
- data/lib/nhkore/user_agents.rb +28 -32
- data/lib/nhkore/util.rb +72 -101
- data/lib/nhkore/variator.rb +20 -32
- data/lib/nhkore/version.rb +4 -16
- data/lib/nhkore/word.rb +105 -99
- data/nhkore.gemspec +58 -65
- data/samples/looper.rb +71 -0
- data/test/nhkore/test_helper.rb +3 -15
- data/test/nhkore_test.rb +6 -18
- metadata +53 -30
data/lib/nhkore/splitter.rb
CHANGED
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -26,24 +14,24 @@ require 'nhkore/util'
|
|
26
14
|
|
27
15
|
module NHKore
|
28
16
|
###
|
29
|
-
# @author Jonathan Bradley Whited
|
17
|
+
# @author Jonathan Bradley Whited
|
30
18
|
# @since 0.2.0
|
31
19
|
###
|
32
20
|
class Splitter
|
33
21
|
def begin_split(str)
|
34
22
|
return str
|
35
23
|
end
|
36
|
-
|
24
|
+
|
37
25
|
def split(str)
|
38
26
|
str = begin_split(str)
|
39
27
|
str = end_split(str)
|
40
|
-
|
28
|
+
|
41
29
|
return str
|
42
30
|
end
|
43
31
|
end
|
44
|
-
|
32
|
+
|
45
33
|
###
|
46
|
-
# @author Jonathan Bradley Whited
|
34
|
+
# @author Jonathan Bradley Whited
|
47
35
|
# @since 0.2.0
|
48
36
|
###
|
49
37
|
class BasicSplitter < Splitter
|
@@ -51,43 +39,43 @@ module NHKore
|
|
51
39
|
return str.split(Util::NORMALIZE_STR_REGEX)
|
52
40
|
end
|
53
41
|
end
|
54
|
-
|
42
|
+
|
55
43
|
###
|
56
44
|
# @since 0.2.0
|
57
45
|
###
|
58
46
|
class BimyouSplitter < Splitter
|
59
47
|
def initialize(*)
|
60
48
|
require 'bimyou_segmenter'
|
61
|
-
|
49
|
+
|
62
50
|
super
|
63
51
|
end
|
64
|
-
|
52
|
+
|
65
53
|
def end_split(str)
|
66
54
|
return BimyouSegmenter.segment(str,symbol: false,white_space: false)
|
67
55
|
end
|
68
56
|
end
|
69
|
-
|
57
|
+
|
70
58
|
###
|
71
59
|
# @since 0.2.0
|
72
60
|
###
|
73
61
|
class TinySplitter < Splitter
|
74
62
|
attr_accessor :tiny
|
75
|
-
|
63
|
+
|
76
64
|
def initialize(*)
|
77
65
|
require 'tiny_segmenter'
|
78
|
-
|
66
|
+
|
79
67
|
super
|
80
|
-
|
81
|
-
@tiny = TinySegmenter.new
|
68
|
+
|
69
|
+
@tiny = TinySegmenter.new
|
82
70
|
end
|
83
|
-
|
71
|
+
|
84
72
|
def end_split(str)
|
85
73
|
return @tiny.segment(str,ignore_punctuation: true)
|
86
74
|
end
|
87
75
|
end
|
88
|
-
|
76
|
+
|
89
77
|
###
|
90
|
-
# @author Jonathan Bradley Whited
|
78
|
+
# @author Jonathan Bradley Whited
|
91
79
|
# @since 0.2.0
|
92
80
|
###
|
93
81
|
class BestSplitter < BimyouSplitter
|
data/lib/nhkore/user_agents.rb
CHANGED
@@ -1,39 +1,33 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
24
|
-
if $
|
25
|
-
|
26
|
-
|
12
|
+
if $PROGRAM_NAME == __FILE__
|
13
|
+
require 'bundler/inline'
|
14
|
+
|
15
|
+
gemfile do
|
16
|
+
source 'https://rubygems.org'
|
17
|
+
|
18
|
+
gem 'user-agent-randomizer','~> 0.2',require: false
|
19
|
+
end
|
20
|
+
|
27
21
|
require 'set'
|
28
22
|
require 'user_agent_randomizer'
|
29
|
-
|
30
|
-
agents = Set.new
|
31
|
-
|
23
|
+
|
24
|
+
agents = Set.new
|
25
|
+
|
32
26
|
while agents.length < 1111
|
33
27
|
agents.add(UserAgentRandomizer::UserAgent.fetch(type: 'desktop_browser').string)
|
34
28
|
end
|
35
|
-
|
36
|
-
agents.each
|
29
|
+
|
30
|
+
agents.each do |agent|
|
37
31
|
puts "'#{agent}',"
|
38
32
|
end
|
39
33
|
end
|
@@ -41,26 +35,27 @@ end
|
|
41
35
|
module NHKore
|
42
36
|
###
|
43
37
|
# 1111 user-agents produced using the +user-agent-randomizer+ gem.
|
44
|
-
#
|
38
|
+
#
|
45
39
|
# The gem is really old and had a lot of warnings, so decided to make this class.
|
46
40
|
# Maybe I'll fork the gem and maintain a new version in the future...
|
47
|
-
#
|
48
|
-
# @author Jonathan Bradley Whited
|
41
|
+
#
|
42
|
+
# @author Jonathan Bradley Whited
|
49
43
|
# @since 0.2.1
|
50
44
|
###
|
51
45
|
class UserAgents
|
52
46
|
attr_accessor :data
|
53
|
-
|
54
|
-
def self.sample
|
55
|
-
return UserAgents.new
|
47
|
+
|
48
|
+
def self.sample
|
49
|
+
return UserAgents.new.data.sample
|
56
50
|
end
|
57
|
-
|
51
|
+
|
58
52
|
# Decided to store the data in an instance variable (instead of a constant)
|
59
53
|
# because we don't need all of the data in memory after getting just 1
|
60
54
|
# sample, even though it's slower.
|
61
|
-
def initialize
|
55
|
+
def initialize
|
62
56
|
super()
|
63
|
-
|
57
|
+
|
58
|
+
# rubocop:disable all
|
64
59
|
@data = [
|
65
60
|
'ELinks/0.11.3 (textmode; Darwin 10.2.0 i386; 80x24-2)',
|
66
61
|
'Mozilla/4.0 (compatible; MSIE 6.0; X11; Linux i686; de) Opera 10.10',
|
@@ -1174,6 +1169,7 @@ module NHKore
|
|
1174
1169
|
'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.1a) Gecko/20020611',
|
1175
1170
|
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
|
1176
1171
|
]
|
1172
|
+
# rubocop:enable all
|
1177
1173
|
end
|
1178
1174
|
end
|
1179
1175
|
end
|
data/lib/nhkore/util.rb
CHANGED
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -29,127 +17,110 @@ require 'uri'
|
|
29
17
|
|
30
18
|
module NHKore
|
31
19
|
###
|
32
|
-
# @author Jonathan Bradley Whited
|
20
|
+
# @author Jonathan Bradley Whited
|
33
21
|
# @since 0.2.0
|
34
22
|
###
|
35
23
|
module Util
|
36
24
|
CORE_DIR = 'core'
|
37
25
|
WEB_DIR = 'web'
|
38
|
-
|
26
|
+
|
39
27
|
JST_OFFSET = '+09:00' # Japan Standard Time (JST) time zone offset from UTC
|
40
28
|
JST_OFFSET_HOUR = 9
|
41
29
|
JST_OFFSET_MIN = 0
|
42
|
-
|
43
|
-
HIRAGANA_REGEX = /\p{Hiragana}
|
30
|
+
|
31
|
+
HIRAGANA_REGEX = /\p{Hiragana}/.freeze
|
44
32
|
JPN_SPACE = "\u3000" # Must be double-quoted for escape chars
|
45
|
-
KANA_REGEX = /\p{Hiragana}|\p{Katakana}
|
46
|
-
KANJI_REGEX = /\p{Han}
|
47
|
-
KATAKANA_REGEX = /\p{Katakana}
|
48
|
-
NORMALIZE_STR_REGEX = /[^[[:alpha:]]]
|
49
|
-
STRIP_WEB_STR_REGEX = /(\A[[:space:]]+)|([[:space:]]+\z)
|
50
|
-
WEB_SPACES_REGEX = /[[:space:]]
|
51
|
-
|
52
|
-
def self.jst_now
|
53
|
-
return Time.now
|
54
|
-
end
|
55
|
-
|
56
|
-
JST_YEAR = jst_now
|
33
|
+
KANA_REGEX = /\p{Hiragana}|\p{Katakana}/.freeze
|
34
|
+
KANJI_REGEX = /\p{Han}/.freeze # Han probably stands for Hanzi?
|
35
|
+
KATAKANA_REGEX = /\p{Katakana}/.freeze
|
36
|
+
NORMALIZE_STR_REGEX = /[^[[:alpha:]]]+/.freeze
|
37
|
+
STRIP_WEB_STR_REGEX = /(\A[[:space:]]+)|([[:space:]]+\z)/.freeze
|
38
|
+
WEB_SPACES_REGEX = /[[:space:]]+/.freeze
|
39
|
+
|
40
|
+
def self.jst_now
|
41
|
+
return Time.now.getlocal(JST_OFFSET)
|
42
|
+
end
|
43
|
+
|
44
|
+
JST_YEAR = jst_now.year
|
57
45
|
MAX_SANE_YEAR = JST_YEAR + 1 # +1 Justin Case for time zone differences at the end of the year
|
58
|
-
|
46
|
+
|
59
47
|
# NHK was founded in 1924/25.
|
60
48
|
# - https://www.nhk.or.jp/bunken/english/about/history.html
|
61
49
|
# - https://en.wikipedia.org/wiki/NHK
|
62
50
|
# However, when was the website first created?
|
63
51
|
MIN_SANE_YEAR = 1924
|
64
|
-
|
52
|
+
|
65
53
|
def self.dir_str?(str)
|
66
|
-
return str.match?(
|
54
|
+
return str.match?(%r{[/\\]\s*\z})
|
67
55
|
end
|
68
|
-
|
56
|
+
|
69
57
|
def self.domain(host,clean: true)
|
70
58
|
require 'public_suffix'
|
71
|
-
|
59
|
+
|
72
60
|
domain = PublicSuffix.domain(host)
|
73
|
-
domain = unspace_web_str(domain).downcase
|
74
|
-
|
61
|
+
domain = unspace_web_str(domain).downcase if !domain.nil? && clean
|
62
|
+
|
75
63
|
return domain
|
76
64
|
end
|
77
|
-
|
65
|
+
|
78
66
|
def self.dump_yaml(obj,flow_level: 8,stylers: nil)
|
79
67
|
require 'psychgus'
|
80
|
-
|
68
|
+
|
81
69
|
stylers = Array(stylers)
|
82
|
-
|
70
|
+
|
83
71
|
return Psychgus.dump(obj,
|
84
72
|
deref_aliases: true, # Dereference aliases for load_yaml()
|
85
73
|
header: true, # %YAML [version]
|
86
|
-
line_width:
|
74
|
+
line_width: 10_000, # Try not to wrap; ichiman!
|
87
75
|
stylers: [
|
88
76
|
Psychgus::FlowStyler.new(flow_level), # Put extra details on one line (flow/inline style)
|
89
77
|
Psychgus::NoSymStyler.new(cap: false), # Remove symbols, don't capitalize
|
90
|
-
Psychgus::NoTagStyler.new
|
78
|
+
Psychgus::NoTagStyler.new, # Remove class names (tags)
|
91
79
|
].concat(stylers),
|
92
80
|
)
|
93
81
|
end
|
94
|
-
|
82
|
+
|
95
83
|
def self.empty_web_str?(str)
|
96
|
-
return str.nil?
|
84
|
+
return str.nil? || strip_web_str(str).empty?
|
97
85
|
end
|
98
|
-
|
86
|
+
|
99
87
|
def self.escape_html(str)
|
100
88
|
str = CGI.escapeHTML(str)
|
101
89
|
str = str.gsub("\n",'<br>')
|
102
|
-
|
90
|
+
|
103
91
|
return str
|
104
92
|
end
|
105
|
-
|
93
|
+
|
106
94
|
def self.filename_str?(str)
|
107
95
|
# Do not use "!dir_str?()"! It's not the same meaning!
|
108
|
-
return !str.match?(
|
109
|
-
end
|
110
|
-
|
111
|
-
def self.guess_year(year)
|
112
|
-
if year < 100
|
113
|
-
# 2021 -> 2000.
|
114
|
-
millennium = JST_YEAR / 100 * 100
|
115
|
-
|
116
|
-
# If year <= (2021 -> 21), assume this century.
|
117
|
-
if year <= (JST_YEAR % 100)
|
118
|
-
year = millennium + year
|
119
|
-
else
|
120
|
-
# Assume previous century (2000 -> 1900).
|
121
|
-
year = (millennium - 100) + year
|
122
|
-
end
|
123
|
-
end
|
124
|
-
|
125
|
-
return year
|
96
|
+
return !str.match?(%r{[/\\]})
|
126
97
|
end
|
127
|
-
|
98
|
+
|
128
99
|
def self.hiragana?(str)
|
129
100
|
return HIRAGANA_REGEX =~ str
|
130
101
|
end
|
131
|
-
|
102
|
+
|
132
103
|
# This doesn't modify the hour/minute according to {JST_OFFSET},
|
133
104
|
# but instead, it just drops {JST_OFFSET} into it without adjusting it.
|
134
105
|
def self.jst_time(time)
|
135
106
|
return Time.new(time.year,time.month,time.day,time.hour,time.min,time.sec,JST_OFFSET)
|
136
107
|
end
|
137
|
-
|
108
|
+
|
138
109
|
def self.kana?(str)
|
139
110
|
return KANA_REGEX =~ str
|
140
111
|
end
|
141
|
-
|
112
|
+
|
142
113
|
def self.kanji?(str)
|
143
114
|
return KANJI_REGEX =~ str
|
144
115
|
end
|
145
|
-
|
116
|
+
|
146
117
|
def self.katakana?(str)
|
147
118
|
return KATAKANA_REGEX =~ str
|
148
119
|
end
|
149
|
-
|
120
|
+
|
150
121
|
def self.load_yaml(data,file: nil,**kargs)
|
151
122
|
require 'psychgus'
|
152
|
-
|
123
|
+
|
153
124
|
return Psych.safe_load(data,
|
154
125
|
aliases: false,
|
155
126
|
filename: file,
|
@@ -159,72 +130,72 @@ module NHKore
|
|
159
130
|
**kargs,
|
160
131
|
)
|
161
132
|
end
|
162
|
-
|
133
|
+
|
163
134
|
def self.normalize_str(str)
|
164
135
|
return str.gsub(NORMALIZE_STR_REGEX,'')
|
165
136
|
end
|
166
|
-
|
137
|
+
|
167
138
|
def self.reduce_jpn_space(str)
|
168
139
|
# Do not strip; use a Japanese space
|
169
140
|
return str.gsub(WEB_SPACES_REGEX,JPN_SPACE)
|
170
141
|
end
|
171
|
-
|
142
|
+
|
172
143
|
def self.reduce_space(str)
|
173
144
|
return str.gsub(WEB_SPACES_REGEX,' ')
|
174
145
|
end
|
175
|
-
|
146
|
+
|
176
147
|
def self.replace_uri_query!(uri,**new_query)
|
177
|
-
return uri if new_query.empty?
|
178
|
-
|
148
|
+
return uri if new_query.empty?
|
149
|
+
|
179
150
|
query = uri.query
|
180
|
-
query = query.nil?
|
181
|
-
|
151
|
+
query = query.nil? ? [] : URI.decode_www_form(query)
|
152
|
+
|
182
153
|
# First, remove the old ones.
|
183
|
-
if !query.empty?
|
184
|
-
new_query_keys = Set.new(new_query.keys.map
|
185
|
-
unspace_web_str(key.to_s
|
186
|
-
|
187
|
-
|
188
|
-
query.filter!
|
189
|
-
if q.nil?
|
154
|
+
if !query.empty?
|
155
|
+
new_query_keys = Set.new(new_query.keys.map do |key|
|
156
|
+
unspace_web_str(key.to_s).downcase
|
157
|
+
end)
|
158
|
+
|
159
|
+
query.filter! do |q|
|
160
|
+
if q.nil? || q.empty?
|
190
161
|
false
|
191
162
|
else
|
192
|
-
key = unspace_web_str(q[0].to_s
|
193
|
-
|
163
|
+
key = unspace_web_str(q[0].to_s).downcase
|
164
|
+
|
194
165
|
!new_query_keys.include?(key)
|
195
166
|
end
|
196
167
|
end
|
197
168
|
end
|
198
|
-
|
169
|
+
|
199
170
|
# Next, add the new ones.
|
200
|
-
new_query.each
|
201
|
-
query << [key,value.nil?
|
171
|
+
new_query.each do |key,value|
|
172
|
+
query << [key,value.nil? ? '' : value]
|
202
173
|
end
|
203
|
-
|
174
|
+
|
204
175
|
uri.query = URI.encode_www_form(query)
|
205
|
-
|
176
|
+
|
206
177
|
return uri
|
207
178
|
end
|
208
|
-
|
179
|
+
|
209
180
|
def self.sane_year?(year)
|
210
181
|
return year >= MIN_SANE_YEAR && year <= MAX_SANE_YEAR
|
211
182
|
end
|
212
|
-
|
183
|
+
|
213
184
|
# String's normal strip() method doesn't work with special Unicode/HTML white space.
|
214
185
|
def self.strip_web_str(str)
|
215
186
|
# After testing with Benchmark, this is slower than one regex.
|
216
187
|
#str = str.gsub(/\A[[:space:]]+/,'')
|
217
188
|
#str = str.gsub(/[[:space:]]+\z/,'')
|
218
|
-
|
189
|
+
|
219
190
|
str = str.gsub(STRIP_WEB_STR_REGEX,'')
|
220
|
-
|
191
|
+
|
221
192
|
return str
|
222
193
|
end
|
223
|
-
|
194
|
+
|
224
195
|
def self.unspace_web_str(str)
|
225
196
|
return str.gsub(WEB_SPACES_REGEX,'')
|
226
197
|
end
|
227
|
-
|
198
|
+
|
228
199
|
def self.warn(msg,uplevel: 1)
|
229
200
|
Kernel.warn(msg,uplevel: uplevel)
|
230
201
|
end
|