nhkore 0.3.4 → 0.3.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,23 +1,11 @@
1
- #!/usr/bin/env ruby
2
1
  # encoding: UTF-8
3
2
  # frozen_string_literal: true
4
3
 
5
4
  #--
6
5
  # This file is part of NHKore.
7
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
- #
9
- # NHKore is free software: you can redistribute it and/or modify
10
- # it under the terms of the GNU Lesser General Public License as published by
11
- # the Free Software Foundation, either version 3 of the License, or
12
- # (at your option) any later version.
13
- #
14
- # NHKore is distributed in the hope that it will be useful,
15
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
- # GNU Lesser General Public License for more details.
18
- #
19
- # You should have received a copy of the GNU Lesser General Public License
20
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
6
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
7
+ #
8
+ # SPDX-License-Identifier: LGPL-3.0-or-later
21
9
  #++
22
10
 
23
11
 
@@ -26,24 +14,24 @@ require 'nhkore/util'
26
14
 
27
15
  module NHKore
28
16
  ###
29
- # @author Jonathan Bradley Whited (@esotericpig)
17
+ # @author Jonathan Bradley Whited
30
18
  # @since 0.2.0
31
19
  ###
32
20
  class Splitter
33
21
  def begin_split(str)
34
22
  return str
35
23
  end
36
-
24
+
37
25
  def split(str)
38
26
  str = begin_split(str)
39
27
  str = end_split(str)
40
-
28
+
41
29
  return str
42
30
  end
43
31
  end
44
-
32
+
45
33
  ###
46
- # @author Jonathan Bradley Whited (@esotericpig)
34
+ # @author Jonathan Bradley Whited
47
35
  # @since 0.2.0
48
36
  ###
49
37
  class BasicSplitter < Splitter
@@ -51,43 +39,43 @@ module NHKore
51
39
  return str.split(Util::NORMALIZE_STR_REGEX)
52
40
  end
53
41
  end
54
-
42
+
55
43
  ###
56
44
  # @since 0.2.0
57
45
  ###
58
46
  class BimyouSplitter < Splitter
59
47
  def initialize(*)
60
48
  require 'bimyou_segmenter'
61
-
49
+
62
50
  super
63
51
  end
64
-
52
+
65
53
  def end_split(str)
66
54
  return BimyouSegmenter.segment(str,symbol: false,white_space: false)
67
55
  end
68
56
  end
69
-
57
+
70
58
  ###
71
59
  # @since 0.2.0
72
60
  ###
73
61
  class TinySplitter < Splitter
74
62
  attr_accessor :tiny
75
-
63
+
76
64
  def initialize(*)
77
65
  require 'tiny_segmenter'
78
-
66
+
79
67
  super
80
-
81
- @tiny = TinySegmenter.new()
68
+
69
+ @tiny = TinySegmenter.new
82
70
  end
83
-
71
+
84
72
  def end_split(str)
85
73
  return @tiny.segment(str,ignore_punctuation: true)
86
74
  end
87
75
  end
88
-
76
+
89
77
  ###
90
- # @author Jonathan Bradley Whited (@esotericpig)
78
+ # @author Jonathan Bradley Whited
91
79
  # @since 0.2.0
92
80
  ###
93
81
  class BestSplitter < BimyouSplitter
@@ -1,39 +1,33 @@
1
- #!/usr/bin/env ruby
2
1
  # encoding: UTF-8
3
2
  # frozen_string_literal: true
4
3
 
5
4
  #--
6
5
  # This file is part of NHKore.
7
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
- #
9
- # NHKore is free software: you can redistribute it and/or modify
10
- # it under the terms of the GNU Lesser General Public License as published by
11
- # the Free Software Foundation, either version 3 of the License, or
12
- # (at your option) any later version.
13
- #
14
- # NHKore is distributed in the hope that it will be useful,
15
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
- # GNU Lesser General Public License for more details.
18
- #
19
- # You should have received a copy of the GNU Lesser General Public License
20
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
6
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
7
+ #
8
+ # SPDX-License-Identifier: LGPL-3.0-or-later
21
9
  #++
22
10
 
23
11
 
24
- if $0 == __FILE__
25
- # `gem install 'user-agent-randomizer'` required.
26
-
12
+ if $PROGRAM_NAME == __FILE__
13
+ require 'bundler/inline'
14
+
15
+ gemfile do
16
+ source 'https://rubygems.org'
17
+
18
+ gem 'user-agent-randomizer','~> 0.2',require: false
19
+ end
20
+
27
21
  require 'set'
28
22
  require 'user_agent_randomizer'
29
-
30
- agents = Set.new()
31
-
23
+
24
+ agents = Set.new
25
+
32
26
  while agents.length < 1111
33
27
  agents.add(UserAgentRandomizer::UserAgent.fetch(type: 'desktop_browser').string)
34
28
  end
35
-
36
- agents.each() do |agent|
29
+
30
+ agents.each do |agent|
37
31
  puts "'#{agent}',"
38
32
  end
39
33
  end
@@ -41,26 +35,27 @@ end
41
35
  module NHKore
42
36
  ###
43
37
  # 1111 user-agents produced using the +user-agent-randomizer+ gem.
44
- #
38
+ #
45
39
  # The gem is really old and had a lot of warnings, so decided to make this class.
46
40
  # Maybe I'll fork the gem and maintain a new version in the future...
47
- #
48
- # @author Jonathan Bradley Whited (@esotericpig)
41
+ #
42
+ # @author Jonathan Bradley Whited
49
43
  # @since 0.2.1
50
44
  ###
51
45
  class UserAgents
52
46
  attr_accessor :data
53
-
54
- def self.sample()
55
- return UserAgents.new().data.sample()
47
+
48
+ def self.sample
49
+ return UserAgents.new.data.sample
56
50
  end
57
-
51
+
58
52
  # Decided to store the data in an instance variable (instead of a constant)
59
53
  # because we don't need all of the data in memory after getting just 1
60
54
  # sample, even though it's slower.
61
- def initialize()
55
+ def initialize
62
56
  super()
63
-
57
+
58
+ # rubocop:disable all
64
59
  @data = [
65
60
  'ELinks/0.11.3 (textmode; Darwin 10.2.0 i386; 80x24-2)',
66
61
  'Mozilla/4.0 (compatible; MSIE 6.0; X11; Linux i686; de) Opera 10.10',
@@ -1174,6 +1169,7 @@ module NHKore
1174
1169
  'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.1a) Gecko/20020611',
1175
1170
  'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
1176
1171
  ]
1172
+ # rubocop:enable all
1177
1173
  end
1178
1174
  end
1179
1175
  end
data/lib/nhkore/util.rb CHANGED
@@ -1,23 +1,11 @@
1
- #!/usr/bin/env ruby
2
1
  # encoding: UTF-8
3
2
  # frozen_string_literal: true
4
3
 
5
4
  #--
6
5
  # This file is part of NHKore.
7
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
- #
9
- # NHKore is free software: you can redistribute it and/or modify
10
- # it under the terms of the GNU Lesser General Public License as published by
11
- # the Free Software Foundation, either version 3 of the License, or
12
- # (at your option) any later version.
13
- #
14
- # NHKore is distributed in the hope that it will be useful,
15
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
- # GNU Lesser General Public License for more details.
18
- #
19
- # You should have received a copy of the GNU Lesser General Public License
20
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
6
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
7
+ #
8
+ # SPDX-License-Identifier: LGPL-3.0-or-later
21
9
  #++
22
10
 
23
11
 
@@ -29,110 +17,110 @@ require 'uri'
29
17
 
30
18
  module NHKore
31
19
  ###
32
- # @author Jonathan Bradley Whited (@esotericpig)
20
+ # @author Jonathan Bradley Whited
33
21
  # @since 0.2.0
34
22
  ###
35
23
  module Util
36
24
  CORE_DIR = 'core'
37
25
  WEB_DIR = 'web'
38
-
26
+
39
27
  JST_OFFSET = '+09:00' # Japan Standard Time (JST) time zone offset from UTC
40
28
  JST_OFFSET_HOUR = 9
41
29
  JST_OFFSET_MIN = 0
42
-
43
- HIRAGANA_REGEX = /\p{Hiragana}/
30
+
31
+ HIRAGANA_REGEX = /\p{Hiragana}/.freeze
44
32
  JPN_SPACE = "\u3000" # Must be double-quoted for escape chars
45
- KANA_REGEX = /\p{Hiragana}|\p{Katakana}/
46
- KANJI_REGEX = /\p{Han}/ # Han probably stands for Hanzi?
47
- KATAKANA_REGEX = /\p{Katakana}/
48
- NORMALIZE_STR_REGEX = /[^[[:alpha:]]]+/
49
- STRIP_WEB_STR_REGEX = /(\A[[:space:]]+)|([[:space:]]+\z)/
50
- WEB_SPACES_REGEX = /[[:space:]]+/
51
-
52
- def self.jst_now()
53
- return Time.now().getlocal(JST_OFFSET)
54
- end
55
-
56
- JST_YEAR = jst_now().year
33
+ KANA_REGEX = /\p{Hiragana}|\p{Katakana}/.freeze
34
+ KANJI_REGEX = /\p{Han}/.freeze # Han probably stands for Hanzi?
35
+ KATAKANA_REGEX = /\p{Katakana}/.freeze
36
+ NORMALIZE_STR_REGEX = /[^[[:alpha:]]]+/.freeze
37
+ STRIP_WEB_STR_REGEX = /(\A[[:space:]]+)|([[:space:]]+\z)/.freeze
38
+ WEB_SPACES_REGEX = /[[:space:]]+/.freeze
39
+
40
+ def self.jst_now
41
+ return Time.now.getlocal(JST_OFFSET)
42
+ end
43
+
44
+ JST_YEAR = jst_now.year
57
45
  MAX_SANE_YEAR = JST_YEAR + 1 # +1 Justin Case for time zone differences at the end of the year
58
-
46
+
59
47
  # NHK was founded in 1924/25.
60
48
  # - https://www.nhk.or.jp/bunken/english/about/history.html
61
49
  # - https://en.wikipedia.org/wiki/NHK
62
50
  # However, when was the website first created?
63
51
  MIN_SANE_YEAR = 1924
64
-
52
+
65
53
  def self.dir_str?(str)
66
- return str.match?(/[\/\\]\s*\z/)
54
+ return str.match?(%r{[/\\]\s*\z})
67
55
  end
68
-
56
+
69
57
  def self.domain(host,clean: true)
70
58
  require 'public_suffix'
71
-
59
+
72
60
  domain = PublicSuffix.domain(host)
73
- domain = unspace_web_str(domain).downcase() if !domain.nil?() && clean
74
-
61
+ domain = unspace_web_str(domain).downcase if !domain.nil? && clean
62
+
75
63
  return domain
76
64
  end
77
-
65
+
78
66
  def self.dump_yaml(obj,flow_level: 8,stylers: nil)
79
67
  require 'psychgus'
80
-
68
+
81
69
  stylers = Array(stylers)
82
-
70
+
83
71
  return Psychgus.dump(obj,
84
72
  deref_aliases: true, # Dereference aliases for load_yaml()
85
73
  header: true, # %YAML [version]
86
- line_width: 10000, # Try not to wrap; ichiman!
74
+ line_width: 10_000, # Try not to wrap; ichiman!
87
75
  stylers: [
88
76
  Psychgus::FlowStyler.new(flow_level), # Put extra details on one line (flow/inline style)
89
77
  Psychgus::NoSymStyler.new(cap: false), # Remove symbols, don't capitalize
90
- Psychgus::NoTagStyler.new(), # Remove class names (tags)
78
+ Psychgus::NoTagStyler.new, # Remove class names (tags)
91
79
  ].concat(stylers),
92
80
  )
93
81
  end
94
-
82
+
95
83
  def self.empty_web_str?(str)
96
- return str.nil?() || strip_web_str(str).empty?()
84
+ return str.nil? || strip_web_str(str).empty?
97
85
  end
98
-
86
+
99
87
  def self.escape_html(str)
100
88
  str = CGI.escapeHTML(str)
101
89
  str = str.gsub("\n",'<br>')
102
-
90
+
103
91
  return str
104
92
  end
105
-
93
+
106
94
  def self.filename_str?(str)
107
95
  # Do not use "!dir_str?()"! It's not the same meaning!
108
- return !str.match?(/[\/\\]/)
96
+ return !str.match?(%r{[/\\]})
109
97
  end
110
-
98
+
111
99
  def self.hiragana?(str)
112
100
  return HIRAGANA_REGEX =~ str
113
101
  end
114
-
102
+
115
103
  # This doesn't modify the hour/minute according to {JST_OFFSET},
116
104
  # but instead, it just drops {JST_OFFSET} into it without adjusting it.
117
105
  def self.jst_time(time)
118
106
  return Time.new(time.year,time.month,time.day,time.hour,time.min,time.sec,JST_OFFSET)
119
107
  end
120
-
108
+
121
109
  def self.kana?(str)
122
110
  return KANA_REGEX =~ str
123
111
  end
124
-
112
+
125
113
  def self.kanji?(str)
126
114
  return KANJI_REGEX =~ str
127
115
  end
128
-
116
+
129
117
  def self.katakana?(str)
130
118
  return KATAKANA_REGEX =~ str
131
119
  end
132
-
120
+
133
121
  def self.load_yaml(data,file: nil,**kargs)
134
122
  require 'psychgus'
135
-
123
+
136
124
  return Psych.safe_load(data,
137
125
  aliases: false,
138
126
  filename: file,
@@ -142,72 +130,72 @@ module NHKore
142
130
  **kargs,
143
131
  )
144
132
  end
145
-
133
+
146
134
  def self.normalize_str(str)
147
135
  return str.gsub(NORMALIZE_STR_REGEX,'')
148
136
  end
149
-
137
+
150
138
  def self.reduce_jpn_space(str)
151
139
  # Do not strip; use a Japanese space
152
140
  return str.gsub(WEB_SPACES_REGEX,JPN_SPACE)
153
141
  end
154
-
142
+
155
143
  def self.reduce_space(str)
156
144
  return str.gsub(WEB_SPACES_REGEX,' ')
157
145
  end
158
-
146
+
159
147
  def self.replace_uri_query!(uri,**new_query)
160
- return uri if new_query.empty?()
161
-
148
+ return uri if new_query.empty?
149
+
162
150
  query = uri.query
163
- query = query.nil?() ? [] : URI.decode_www_form(query)
164
-
151
+ query = query.nil? ? [] : URI.decode_www_form(query)
152
+
165
153
  # First, remove the old ones.
166
- if !query.empty?()
167
- new_query_keys = Set.new(new_query.keys.map() {|key|
168
- unspace_web_str(key.to_s()).downcase()
169
- })
170
-
171
- query.filter!() do |q|
172
- if q.nil?() || q.empty?()
154
+ if !query.empty?
155
+ new_query_keys = Set.new(new_query.keys.map do |key|
156
+ unspace_web_str(key.to_s).downcase
157
+ end)
158
+
159
+ query.filter! do |q|
160
+ if q.nil? || q.empty?
173
161
  false
174
162
  else
175
- key = unspace_web_str(q[0].to_s()).downcase()
176
-
163
+ key = unspace_web_str(q[0].to_s).downcase
164
+
177
165
  !new_query_keys.include?(key)
178
166
  end
179
167
  end
180
168
  end
181
-
169
+
182
170
  # Next, add the new ones.
183
- new_query.each() do |key,value|
184
- query << [key,value.nil?() ? '' : value]
171
+ new_query.each do |key,value|
172
+ query << [key,value.nil? ? '' : value]
185
173
  end
186
-
174
+
187
175
  uri.query = URI.encode_www_form(query)
188
-
176
+
189
177
  return uri
190
178
  end
191
-
179
+
192
180
  def self.sane_year?(year)
193
181
  return year >= MIN_SANE_YEAR && year <= MAX_SANE_YEAR
194
182
  end
195
-
183
+
196
184
  # String's normal strip() method doesn't work with special Unicode/HTML white space.
197
185
  def self.strip_web_str(str)
198
186
  # After testing with Benchmark, this is slower than one regex.
199
187
  #str = str.gsub(/\A[[:space:]]+/,'')
200
188
  #str = str.gsub(/[[:space:]]+\z/,'')
201
-
189
+
202
190
  str = str.gsub(STRIP_WEB_STR_REGEX,'')
203
-
191
+
204
192
  return str
205
193
  end
206
-
194
+
207
195
  def self.unspace_web_str(str)
208
196
  return str.gsub(WEB_SPACES_REGEX,'')
209
197
  end
210
-
198
+
211
199
  def self.warn(msg,uplevel: 1)
212
200
  Kernel.warn(msg,uplevel: uplevel)
213
201
  end