nhkore 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -1
- data/README.md +18 -6
- data/Rakefile +11 -16
- data/bin/nhkore +1 -3
- data/lib/nhkore/app.rb +616 -0
- data/lib/nhkore/article.rb +130 -0
- data/lib/nhkore/article_scraper.rb +653 -0
- data/lib/nhkore/cleaner.rb +91 -0
- data/lib/nhkore/cli/bing_cmd.rb +220 -0
- data/lib/nhkore/cli/fx_cmd.rb +116 -0
- data/lib/nhkore/cli/get_cmd.rb +153 -0
- data/lib/nhkore/cli/news_cmd.rb +375 -0
- data/lib/nhkore/cli/sift_cmd.rb +382 -0
- data/lib/nhkore/defn.rb +104 -0
- data/lib/nhkore/dict.rb +80 -0
- data/lib/nhkore/dict_scraper.rb +76 -0
- data/lib/nhkore/entry.rb +104 -0
- data/lib/nhkore/error.rb +35 -0
- data/lib/nhkore/fileable.rb +48 -0
- data/lib/nhkore/missingno.rb +92 -0
- data/lib/nhkore/news.rb +176 -0
- data/lib/nhkore/polisher.rb +93 -0
- data/lib/nhkore/scraper.rb +137 -0
- data/lib/nhkore/search_link.rb +188 -0
- data/lib/nhkore/search_scraper.rb +152 -0
- data/lib/nhkore/sifter.rb +339 -0
- data/lib/nhkore/splitter.rb +90 -0
- data/lib/nhkore/util.rb +190 -0
- data/lib/nhkore/variator.rb +87 -0
- data/lib/nhkore/version.rb +1 -1
- data/lib/nhkore/word.rb +134 -17
- data/lib/nhkore.rb +39 -40
- data/nhkore.gemspec +23 -8
- data/test/{nhkore_tester.rb → nhkore/test_helper.rb} +3 -1
- data/test/nhkore_test.rb +8 -6
- metadata +204 -11
data/lib/nhkore/entry.rb
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'nhkore/defn'
|
25
|
+
require 'nhkore/util'
|
26
|
+
|
27
|
+
|
28
|
+
module NHKore
|
29
|
+
###
# A single entry made up of an ID and an ordered list of definitions
# (the objects returned by +Defn.scrape+).
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class Entry
  # Separator stripped from the end of each hyouki and used to join them.
  HYOUKI_SEP = '・'

  attr_reader :defns
  attr_accessor :id

  def initialize
    super

    @defns = []
    @id = nil
  end

  # Builds one numbered line per definition, joined by newlines.
  #
  # @return [String] e.g. "1)first\n2)second"; empty if there are no definitions
  def build_defn
    numbered = @defns.each_with_index.map do |defn,index|
      # NOTE(review): the original comment calls this a "Japanese parenthesis";
      #               confirm the character after the number is the intended one.
      "#{index + 1})#{defn}"
    end

    numbered.join("\n")
  end

  # Collects the unique, non-empty hyoukis of every definition, in
  # first-seen order, and joins them with {HYOUKI_SEP}.
  #
  # A Hash is used for uniqueness because it preserves insertion order
  # (since Ruby v1.9), while the Ruby v2.7 doc for Set gives no such guarantee.
  #
  # @return [String]
  def build_hyouki
    seen = @defns.each_with_object({}) do |defn,unique|
      defn.hyoukis.each do |raw|
        trimmed = raw.chomp(HYOUKI_SEP)

        unique[trimmed] = true unless trimmed.empty?
      end
    end

    seen.keys.join(HYOUKI_SEP)
  end

  # Builds an entry from +id+ and an array of hashes, each passed to
  # +Defn.scrape+.
  #
  # @return [Entry,nil] +nil+ if the cleaned ID is empty or if no definition
  #                     could be scraped
  def self.scrape(id,array,missingno: nil,url: nil)
    clean_id = Util.unspace_web_str(id.to_s).downcase

    return nil if clean_id.empty?

    entry = Entry.new
    entry.id = clean_id

    array.each do |hash|
      defn = Defn.scrape(hash,missingno: missingno,url: url)

      entry.defns << defn unless defn.nil?
    end

    entry.defns.empty? ? nil : entry
  end

  # @return [String] the hyouki line (if any) followed by the numbered
  #                  definitions; empty if there are no definitions
  def to_s
    text = ''.dup

    return text if @defns.empty?

    hyouki = build_hyouki

    text << "#{hyouki}\n" unless Util.empty_web_str?(hyouki)
    text << build_defn

    text
  end
end
|
104
|
+
end
|
data/lib/nhkore/error.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
module NHKore
|
25
|
+
###
# Root of this library's error hierarchy; the other error classes
# defined here all inherit from it.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class Error < ::StandardError
end

# @since 0.2.0
class CLIError < Error
end

# @since 0.2.0
class ParseError < Error
end

# @since 0.2.0
class ScrapeError < Error
end

# @since 0.2.0
class ZipError < Error
end
|
35
|
+
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
module NHKore
|
25
|
+
###
# Mixin for saving an object to a file and loading it back.
#
# The including class must provide +#to_s+ (the text that is written) and
# a class method +load_data(data,file:,**kargs)+ (called with the text
# that was read).
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
module Fileable
  # Adds {ClassMethods} (i.e. +load_file+) to the including class.
  def self.included(mod)
    mod.extend ClassMethods
  end

  # Writes +to_s()+ to +file+.
  #
  # @param file  [String] path of the file to write
  # @param mode  [String] open mode passed to +File.open+
  # @param kargs [Hash] extra options passed to +File.open+
  # @return the result of +IO#write+
  def save_file(file,mode: 'wt',**kargs)
    # The block param is deliberately not named +file+,
    # so that it does not shadow the method argument.
    File.open(file,mode: mode,**kargs) do |fout|
      fout.write(to_s())
    end
  end

  module ClassMethods
    # Reads all of +file+ and hands the text to +load_data+.
    #
    # @param file  [String] path of the file to read
    # @param mode  [String] open mode passed to +File.read+
    # @param kargs [Hash] extra options, passed to both +File.read+ and
    #                     +load_data+
    # @return whatever +load_data+ returns
    def load_file(file,mode: 'rt:BOM|UTF-8',**kargs)
      data = File.read(file,mode: mode,**kargs)

      return load_data(data,file: file,**kargs)
    end
  end
end
|
48
|
+
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'nhkore/util'
|
25
|
+
|
26
|
+
|
27
|
+
module NHKore
|
28
|
+
###
# Two lookup tables built from words: kana => word and kanji => word.
# For each key, the first word seen wins.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class Missingno
  attr_reader :kanas
  attr_reader :kanjis

  # @param data [News,Article,Array<Word>] dispatched by duck typing:
  #             responds to +articles+ => news; responds to +words+ => article;
  #             anything else is treated as a collection of words
  def initialize(data)
    super()

    @kanas = {}
    @kanjis = {}

    if data.respond_to?(:articles)
      add_news(data)
    elsif data.respond_to?(:words)
      add_article(data)
    else
      add_words(data)
    end
  end

  # Adds every word of +article+ (the values of +article.words+).
  def add_article(article)
    add_words(article.words.values)
  end

  # Adds every article of +news+ (the values of +news.articles+).
  def add_news(news)
    news.articles.values.each { |article| add_article(article) }
  end

  # Indexes each word by its kana and by its kanji, never replacing a key
  # that is already present.
  def add_words(words)
    words.each do |word|
      # Only keep words with both fields filled in, because
      # Word.scrape_ruby_tag() will raise an error if either is empty.
      next if Util.empty_web_str?(word.kana) || Util.empty_web_str?(word.kanji)

      kanas[word.kana] = word unless kanas.key?(word.kana)
      kanjis[word.kanji] = word unless kanjis.key?(word.kanji)
    end
  end

  # @return the kana of the word stored under +kanji+, or +nil+ if unknown
  def kana_from_kanji(kanji)
    @kanjis[kanji]&.kana
  end

  # @return the kanji of the word stored under +kana+, or +nil+ if unknown
  def kanji_from_kana(kana)
    @kanas[kana]&.kanji
  end
end
|
92
|
+
end
|
data/lib/nhkore/news.rb
ADDED
@@ -0,0 +1,176 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'nhkore/article'
|
25
|
+
require 'nhkore/error'
|
26
|
+
require 'nhkore/fileable'
|
27
|
+
require 'nhkore/util'
|
28
|
+
|
29
|
+
|
30
|
+
module NHKore
|
31
|
+
###
# A collection of articles, keyed by URL (or by a custom key), with a
# secondary index of article SHA-256s.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class News
  include Fileable

  DEFAULT_DIR = Util::CORE_DIR
  # NOTE(review): this is not anchored, so it matches "https:" anywhere in
  #               the URL, not just as the scheme -- confirm this is intended.
  FAVORED_URL = /https\:/i

  # @return [Hash] key (URL by default) => article
  attr_reader :articles
  # @return [Hash] article SHA-256 => article URL
  attr_reader :sha256s

  def initialize()
    super()

    @articles = {}
    @sha256s = {}
  end

  # Adds +article+ under +key+ and records its SHA-256.
  #
  # @param article   the article; must respond to +url+ and +sha256+
  # @param key       key to store the article under; defaults to +article.url+
  # @param overwrite [true,false] if false, duplicates raise an error
  # @return [self]
  # @raise [ArgumentError] if not overwriting and the key or the SHA-256
  #                        has already been added
  def add_article(article,key: nil,overwrite: false)
    key = article.url if key.nil?()

    if !overwrite
      raise ArgumentError,"duplicate article[#{key}] in articles" if @articles.key?(key)
      raise ArgumentError,"duplicate sha256[#{article.sha256}] in articles" if @sha256s.key?(article.sha256)
    end

    @articles[key] = article
    @sha256s[article.sha256] = article.url

    return self
  end

  # @return [String] +filename+ joined onto {DEFAULT_DIR}
  def self.build_file(filename)
    return File.join(DEFAULT_DIR,filename)
  end

  # Serialization hook: only the articles are output.
  def encode_with(coder)
    # Order matters.
    # Don't output @sha256s.

    coder[:articles] = @articles
  end

  # Builds a news object from YAML text.
  #
  # @param data          [String] the YAML, loaded with +Util.load_yaml+
  # @param article_class class whose +load_data(key,hash)+ builds each article
  # @param file          [String,nil] passed along to +Util.load_yaml+
  # @param news_class    class of the object to build and return
  # @return an instance of +news_class+
  def self.load_data(data,article_class: Article,file: nil,news_class: News,**kargs)
    data = Util.load_yaml(data,file: file)

    articles = data[:articles]

    news = news_class.new()

    if !articles.nil?()
      articles.each() do |key,hash|
        key = key.to_s() # Change from a symbol
        news.add_article(article_class.load_data(key,hash),key: key)
      end
    end

    return news
  end

  # Moves +article+ to the favored (https) +url+.
  #
  # Does nothing if the article's current URL is already favored or if
  # +url+ is not favored.
  def update_article(article,url)
    # Favor https.
    return if article.url =~ FAVORED_URL
    return if url !~ FAVORED_URL

    @articles.delete(article.url)
    @articles[url] = article
    article.url = url

    # Keep the SHA-256 index in sync; else, it would still hold the old URL.
    @sha256s[article.sha256] = url
  end

  # @return the article stored under +key+, or +nil+
  def article(key)
    return @articles[key]
  end

  # @return the first article whose SHA-256 equals +sha256+, or +nil+
  def article_with_sha256(sha256)
    return @articles.values().find() { |article| article.sha256 == sha256 }
  end

  def article?(key)
    return @articles.key?(key)
  end

  def sha256?(sha256)
    return @sha256s.key?(sha256)
  end

  def to_s()
    # Put each Word on one line (flow/inline style).
    return Util.dump_yaml(self,flow_level: 8)
  end
end
|
134
|
+
|
135
|
+
###
# News stored, by default, in the file 'nhk_news_web_regular.yml'.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class FutsuuNews < News
  DEFAULT_FILENAME = 'nhk_news_web_regular.yml'
  DEFAULT_FILE = build_file(DEFAULT_FILENAME)

  class << self
    # Same as +News.load_data+, but builds a {FutsuuNews} of +Article+.
    def load_data(data,**kargs)
      News.load_data(data,article_class: Article,news_class: FutsuuNews,**kargs)
    end

    # Same as +News.load_file+, but +file+ defaults to {DEFAULT_FILE} and
    # a {FutsuuNews} of +Article+ is built.
    def load_file(file=DEFAULT_FILE,**kargs)
      News.load_file(file,article_class: Article,news_class: FutsuuNews,**kargs)
    end
  end

  # Same as the inherited +save_file+, but +file+ defaults to {DEFAULT_FILE}.
  def save_file(file=DEFAULT_FILE,**kargs)
    super(file,**kargs)
  end
end
|
155
|
+
|
156
|
+
###
# News stored, by default, in the file 'nhk_news_web_easy.yml'.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class YasashiiNews < News
  DEFAULT_FILENAME = 'nhk_news_web_easy.yml'
  DEFAULT_FILE = build_file(DEFAULT_FILENAME)

  class << self
    # Same as +News.load_data+, but builds a {YasashiiNews} of +Article+.
    def load_data(data,**kargs)
      News.load_data(data,article_class: Article,news_class: YasashiiNews,**kargs)
    end

    # Same as +News.load_file+, but +file+ defaults to {DEFAULT_FILE} and
    # a {YasashiiNews} of +Article+ is built.
    def load_file(file=DEFAULT_FILE,**kargs)
      News.load_file(file,article_class: Article,news_class: YasashiiNews,**kargs)
    end
  end

  # Same as the inherited +save_file+, but +file+ defaults to {DEFAULT_FILE}.
  def save_file(file=DEFAULT_FILE,**kargs)
    super(file,**kargs)
  end
end
|
176
|
+
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'nhkore/word'
|
25
|
+
|
26
|
+
|
27
|
+
module NHKore
|
28
|
+
###
# Base class for polishing a string in two steps: {#begin_polish}, then
# {#end_polish}. Both steps return the string unchanged by default;
# subclasses override whichever step they need.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class Polisher
  # First step of {#polish}.
  #
  # @return [String] +str+ unchanged
  def begin_polish(str)
    return str
  end

  # Last step of {#polish}.
  #
  # Defined here (mirroring {#begin_polish}) so that {#polish} also works on
  # this base class and on subclasses that only override {#begin_polish},
  # instead of raising a NoMethodError.
  #
  # @return [String] +str+ unchanged
  def end_polish(str)
    return str
  end

  # @return [String] +str+ run through {#begin_polish} and then {#end_polish}
  def polish(str)
    str = begin_polish(str)
    str = end_polish(str)

    return str
  end

  # Polishes a String or a Word with each of the +polishers+, in order.
  #
  # @param obj       [String,Word,nil] what to polish; for a Word, a new Word
  #                  is built from the polished kana & kanji and the
  #                  original word
  # @param polishers [Polisher,Array<Polisher>,nil] one or more polishers
  # @return [String,Word,nil] +nil+ if +obj+ is +nil+; +obj+ itself if there
  #                           are no polishers
  def self.polish_any(obj,polishers)
    return nil if obj.nil?()

    polishers = Array(polishers)

    return obj if polishers.empty?()

    if obj.is_a?(Word)
      return Word.new(
        kana: polish_any(obj.kana,polishers),
        kanji: polish_any(obj.kanji,polishers),
        word: obj
      )
    end

    # Else, a String.
    return polishers.reduce(obj) { |str,polisher| polisher.polish(str) }
  end
end
|
66
|
+
|
67
|
+
###
# Polisher that strips every character that is not alphanumeric or a
# Japanese dot, and blanks strings made up of only digits/dots.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class BasicPolisher < Polisher
  # @return [String] the stripped string, or an empty string if only
  #                  digits and/or Japanese dots would remain
  def end_polish(str)
    # Keep Japanese dots in names:
    # - Yunibaasaru・Sutajio・Japan
    # Keep numbers next to kanji/kana, else the below kana won't make sense:
    # - Word { kanji: 20日, kana: はつか }
    stripped = str.gsub(/[^[[:alnum:]]・]/,'')

    # Numbers/dots by themselves (without kanji/kana) should be ignored (empty).
    without_digits_and_dots = stripped.gsub(/[[[:digit:]]・]+/,'')

    without_digits_and_dots.empty? ? '' : stripped
  end
end
|
86
|
+
|
87
|
+
###
# Currently adds nothing to {BasicPolisher}.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class BestPolisher < BasicPolisher; end
|
93
|
+
end
|
@@ -0,0 +1,137 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'nokogiri'
|
25
|
+
require 'open-uri'
|
26
|
+
|
27
|
+
require 'nhkore/util'
|
28
|
+
|
29
|
+
|
30
|
+
module NHKore
|
31
|
+
###
# Opens a URL, a file, or a caller-supplied String/IO, and exposes it for
# parsing as HTML.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class Scraper
  # Copied from googler (https://github.com/jarun/googler).
  USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'

  attr_accessor :is_file
  attr_accessor :str_or_io
  attr_accessor :url

  alias_method :is_file?,:is_file

  # +max_redirects+ defaults to 3 for safety (infinite-loop attack).
  #
  # All URL options: https://ruby-doc.org/stdlib-2.7.0/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
  #
  # Pass in +header: {}+ for the default HTTP header fields to be set.
  #
  # @param url           [String] URL to open, or a file path if +is_file+
  # @param header        [Hash,nil] HTTP header fields (ignored for a file);
  #                      'User-Agent' defaults to {USER_AGENT}; this Hash is
  #                      not modified
  # @param is_file       [true,false] true to open +url+ as a local file
  # @param max_redirects [Integer,nil] +nil+ or negative means 10,000
  # @param max_retries   [Integer,nil] retries on SocketError; +nil+ means none
  # @param redirect_rule [nil,:lenient,:strict] +:lenient+ requires a redirect
  #                      to keep the original scheme; +:strict+ also requires
  #                      it to keep the original domain; +nil+ checks nothing
  # @param str_or_io     [String,IO,nil] if given, used as is and nothing
  #                      is opened
  # @param kargs         [Hash] extra options for +File.open+ / +URI.open+
  # @raise [OpenURI::HTTPRedirect] if there are too many redirects or a
  #                                redirect breaks +redirect_rule+
  # @raise [OpenURI::HTTPError] on any other HTTP error
  # @raise [SocketError] if it still fails after +max_retries+
  def initialize(url,header: nil,is_file: false,max_redirects: 3,max_retries: 3,redirect_rule: :strict,str_or_io: nil,**kargs)
    super()

    @is_file = is_file
    @url = url

    if !header.nil?() && !is_file
      # Some sites (Search Engines) hate scrapers, so need HTTP header fields.
      # If this isn't enough, look at googler for more header fields to set:
      # - https://github.com/jarun/googler
      # If necessary, can use Faraday, HTTParty, or RestClient gem and
      # pass in to str_or_io.

      # header will overwrite duplicate kargs entries.
      # A new Hash is built so that the caller's header is left untouched.
      kargs = kargs.merge(header)
      kargs['User-Agent'] = USER_AGENT unless header.key?('User-Agent')
    end

    if str_or_io.nil?()
      if is_file
        # NHK's website tends to always use UTF-8.
        @str_or_io = File.open(url,'rt:UTF-8',**kargs)
      else
        max_redirects = 10000 if max_redirects.nil?() || max_redirects < 0

        top_uri = URI(url)
        top_domain = Util.domain(top_uri.host)

        begin
          # Use URI.open() instead of (Kernel.)open() for safety (code-injection attack).
          @str_or_io = URI.open(url,redirect: false,**kargs)
          @url = url
        # HTTPRedirect is a subclass of HTTPError, so it must be rescued
        # first; else, a redirect would never be followed.
        rescue OpenURI::HTTPRedirect => redirect
          redirect_uri = redirect.uri

          if (max_redirects -= 1) < 0
            raise redirect.exception("redirected to URL[#{redirect_uri}]: #{redirect}")
          end

          case redirect_rule
          when :lenient,:strict
            if redirect_uri.scheme != top_uri.scheme
              raise redirect.exception("redirect scheme[#{redirect_uri.scheme}] does not match original " +
                "scheme[#{top_uri.scheme}] at redirect URL[#{redirect_uri}]: #{redirect}")
            end

            if redirect_rule == :strict
              redirect_domain = Util.domain(redirect_uri.host)

              if redirect_domain != top_domain
                raise redirect.exception("redirect domain[#{redirect_domain}] does not match original " +
                  "domain[#{top_domain}] at redirect URL[#{redirect_uri}]: #{redirect}")
              end
            end
          end

          url = redirect_uri

          retry
        rescue OpenURI::HTTPError => e
          raise e.exception("HTTP error[#{e.to_s()}] at URL[#{url}]")
        rescue SocketError
          raise if max_retries.nil?() || (max_retries -= 1) < 0

          retry
        end
      end
    else
      @str_or_io = str_or_io
    end
  end

  # @return the result of parsing {#str_or_io} with +Nokogiri::HTML+
  def html_doc()
    return Nokogiri::HTML(@str_or_io)
  end

  # @return [URI,nil] +relative_url+ joined onto {#url}, or +nil+ for a file
  def join_url(relative_url)
    # For a file, don't know what to do.
    # It would be unsafe to return something else;
    # for example, it could return a lot of "../../../" to your root dir.
    return nil if @is_file

    return URI::join(@url,relative_url)
  end
end
|
137
|
+
end
|