nhkore 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -1
- data/README.md +18 -6
- data/Rakefile +11 -16
- data/bin/nhkore +1 -3
- data/lib/nhkore/app.rb +616 -0
- data/lib/nhkore/article.rb +130 -0
- data/lib/nhkore/article_scraper.rb +653 -0
- data/lib/nhkore/cleaner.rb +91 -0
- data/lib/nhkore/cli/bing_cmd.rb +220 -0
- data/lib/nhkore/cli/fx_cmd.rb +116 -0
- data/lib/nhkore/cli/get_cmd.rb +153 -0
- data/lib/nhkore/cli/news_cmd.rb +375 -0
- data/lib/nhkore/cli/sift_cmd.rb +382 -0
- data/lib/nhkore/defn.rb +104 -0
- data/lib/nhkore/dict.rb +80 -0
- data/lib/nhkore/dict_scraper.rb +76 -0
- data/lib/nhkore/entry.rb +104 -0
- data/lib/nhkore/error.rb +35 -0
- data/lib/nhkore/fileable.rb +48 -0
- data/lib/nhkore/missingno.rb +92 -0
- data/lib/nhkore/news.rb +176 -0
- data/lib/nhkore/polisher.rb +93 -0
- data/lib/nhkore/scraper.rb +137 -0
- data/lib/nhkore/search_link.rb +188 -0
- data/lib/nhkore/search_scraper.rb +152 -0
- data/lib/nhkore/sifter.rb +339 -0
- data/lib/nhkore/splitter.rb +90 -0
- data/lib/nhkore/util.rb +190 -0
- data/lib/nhkore/variator.rb +87 -0
- data/lib/nhkore/version.rb +1 -1
- data/lib/nhkore/word.rb +134 -17
- data/lib/nhkore.rb +39 -40
- data/nhkore.gemspec +23 -8
- data/test/{nhkore_tester.rb → nhkore/test_helper.rb} +3 -1
- data/test/nhkore_test.rb +8 -6
- metadata +204 -11
data/lib/nhkore/entry.rb
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'nhkore/defn'
|
25
|
+
require 'nhkore/util'
|
26
|
+
|
27
|
+
|
28
|
+
module NHKore
|
29
|
+
###
# A dictionary entry: an ID plus the definitions ({Defn}) scraped for it.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class Entry
  # Separator placed between the spellings (hyouki) of an entry.
  HYOUKI_SEP = '・'

  attr_reader :defns
  attr_accessor :id

  def initialize
    super()

    @defns = []
    @id = nil
  end

  # Builds the numbered definition text, one definition per line.
  #
  # @return [String] lines of the form "<n>)<defn>", numbered from 1 and
  #                  joined with newlines; empty if there are no definitions
  def build_defn
    # NOTE(review): the original comment called the ')' below a "Japanese parenthesis",
    #               but the literal here is ASCII; confirm which char is intended.
    return @defns.each_with_index.map { |defn,i| "#{i + 1})#{defn}" }.join("\n")
  end

  # Collects the spellings of every definition, in first-seen order and
  # without duplicates, and joins them with {HYOUKI_SEP}.
  #
  # A trailing {HYOUKI_SEP} is removed from each spelling, and spellings
  # that are empty afterwards are skipped.
  #
  # @return [String] the joined spellings; empty if there are none
  def build_hyouki
    hyoukis = @defns.flat_map do |defn|
      defn.hyoukis.map { |hyouki| hyouki.chomp(HYOUKI_SEP) }
    end

    # Array#uniq keeps the first occurrence of each element, preserving order.
    return hyoukis.reject(&:empty?).uniq.join(HYOUKI_SEP)
  end

  # Builds an Entry from an ID and an array of raw definition hashes.
  #
  # @param id        [#to_s] the entry's ID; passed through Util.unspace_web_str() and downcased
  # @param array     [Array] raw data, each element passed to Defn.scrape()
  # @param missingno [Missingno,nil] forwarded to Defn.scrape()
  # @param url       [String,nil] forwarded to Defn.scrape()
  # @return [Entry,nil] +nil+ if the cleaned ID is empty or no definition could be scraped
  def self.scrape(id,array,missingno: nil,url: nil)
    entry = Entry.new

    entry.id = Util.unspace_web_str(id.to_s).downcase

    return nil if entry.id.empty?

    array.each do |hash|
      defn = Defn.scrape(hash,missingno: missingno,url: url)
      entry.defns << defn unless defn.nil?
    end

    return nil if entry.defns.empty?
    return entry
  end

  # @return [String] the spellings line (if not blank) followed by the
  #                  numbered definitions; empty if there are no definitions
  def to_s
    s = ''.dup

    return s if @defns.empty?

    hyouki = build_hyouki

    s << "#{hyouki}\n" unless Util.empty_web_str?(hyouki)
    s << build_defn

    return s
  end
end
|
104
|
+
end
|
data/lib/nhkore/error.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
module NHKore
|
25
|
+
###
# Base class of the NHKore-specific errors declared below.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class Error < ::StandardError
end

# @since 0.2.0
class CLIError < Error
end

# @since 0.2.0
class ParseError < Error
end

# @since 0.2.0
class ScrapeError < Error
end

# @since 0.2.0
class ZipError < Error
end
|
35
|
+
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
module NHKore
|
25
|
+
###
# Mixin that adds a +save_file+ instance method and a +load_file+ class method.
#
# The including class must provide +to_s+ (its output is what gets written)
# and a class-level +load_data(data,file:,**kargs)+ (what +load_file+ delegates to).
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
module Fileable
  # Hook run on +include+; extends the including class/module with {ClassMethods}.
  def self.included(mod)
    mod.extend ClassMethods
  end

  # Writes +to_s+ to +file+.
  #
  # @param file  [String] path of the file to write
  # @param mode  [String] mode to open the file with
  # @param kargs [Hash] extra options forwarded to File.open()
  # @return the result of the write call
  def save_file(file,mode: 'wt',**kargs)
    # Named +io+ so the block parameter doesn't shadow the +file+ argument.
    File.open(file,mode: mode,**kargs) do |io|
      io.write(to_s)
    end
  end

  module ClassMethods
    # Reads +file+ and passes its contents to this class's +load_data+.
    #
    # @param file  [String] path of the file to read
    # @param mode  [String] mode to read the file with
    # @param kargs [Hash] extra options; forwarded to BOTH File.read() and +load_data+
    # @return whatever +load_data+ returns
    def load_file(file,mode: 'rt:BOM|UTF-8',**kargs)
      data = File.read(file,mode: mode,**kargs)

      return load_data(data,file: file,**kargs)
    end
  end
end
|
48
|
+
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'nhkore/util'
|
25
|
+
|
26
|
+
|
27
|
+
module NHKore
|
28
|
+
###
# Lookup tables from kana to Word and from kanji to Word, built from words
# that have both a kana and a kanji filled in.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class Missingno
  attr_reader :kanas
  attr_reader :kanjis

  # @param data [News,Article,Array<Word>] anything responding to +articles+ is
  #             treated as News, anything responding to +words+ as an Article,
  #             and everything else as a list of words
  def initialize(data)
    super()

    @kanas = {}
    @kanjis = {}

    if data.respond_to?(:articles) # News?
      add_news(data)
    elsif data.respond_to?(:words) # Article?
      add_article(data)
    else
      add_words(data)
    end
  end

  # Adds all of the words (the values of +article.words+) of an article.
  def add_article(article)
    add_words(article.words.values)
  end

  # Adds the words of every article (the values of +news.articles+).
  def add_news(news)
    news.articles.values.each do |article|
      add_article(article)
    end
  end

  # Adds each word to the kana and kanji tables.
  #
  # A word is skipped if its kana or its kanji is empty, and the first word
  # seen for a given kana (or kanji) wins; later ones don't replace it.
  #
  # @param words [Array<Word>] words to index
  def add_words(words)
    words.each do |word|
      # We only want ones that are both filled in because
      #   Word.scrape_ruby_tag() will raise an error if either is empty.
      next if Util.empty_web_str?(word.kana) || Util.empty_web_str?(word.kanji)

      kanas[word.kana] = word unless kanas.key?(word.kana)
      kanjis[word.kanji] = word unless kanjis.key?(word.kanji)
    end
  end

  # @return [String,nil] the kana of the word stored for +kanji+; +nil+ if unknown
  def kana_from_kanji(kanji)
    return @kanjis[kanji]&.kana
  end

  # @return [String,nil] the kanji of the word stored for +kana+; +nil+ if unknown
  def kanji_from_kana(kana)
    return @kanas[kana]&.kanji
  end
end
|
92
|
+
end
|
data/lib/nhkore/news.rb
ADDED
@@ -0,0 +1,176 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'nhkore/article'
|
25
|
+
require 'nhkore/error'
|
26
|
+
require 'nhkore/fileable'
|
27
|
+
require 'nhkore/util'
|
28
|
+
|
29
|
+
|
30
|
+
module NHKore
|
31
|
+
###
# A collection of articles stored by key (the article's URL by default),
# plus an index of the articles' SHA-256 values.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class News
  include Fileable

  DEFAULT_DIR = Util::CORE_DIR
  FAVORED_URL = /https\:/i

  attr_reader :articles
  attr_reader :sha256s

  def initialize
    super()

    @articles = {}
    @sha256s = {}
  end

  # Stores an article and indexes its SHA-256.
  #
  # @param article   [Article] the article to add
  # @param key       [String,nil] key to store it under; defaults to +article.url+
  # @param overwrite [true,false] if false, a duplicate key or SHA-256 raises
  # @return [self]
  # @raise [ArgumentError] if +overwrite+ is false and the key or the SHA-256 already exists
  def add_article(article,key: nil,overwrite: false)
    key = article.url if key.nil?

    unless overwrite
      raise ArgumentError,"duplicate article[#{key}] in articles" if @articles.key?(key)
      raise ArgumentError,"duplicate sha256[#{article.sha256}] in articles" if @sha256s.key?(article.sha256)
    end

    @articles[key] = article
    @sha256s[article.sha256] = article.url

    return self
  end

  # @return [String] +filename+ joined onto {DEFAULT_DIR}
  def self.build_file(filename)
    return File.join(DEFAULT_DIR,filename)
  end

  # Serialization hook: only the articles are written out.
  def encode_with(coder)
    # Order matters.
    # Don't output @sha256s.

    coder[:articles] = @articles
  end

  # Builds a news object from YAML data.
  #
  # @param data          [String] data passed to Util.load_yaml()
  # @param article_class [Class] class whose +load_data(key,hash)+ builds each article
  # @param file          [String,nil] passed to Util.load_yaml()
  # @param news_class    [Class] class to instantiate for the result
  # @param kargs         [Hash] accepted but not used here
  # @return an instance of +news_class+ filled with the loaded articles
  def self.load_data(data,article_class: Article,file: nil,news_class: News,**kargs)
    data = Util.load_yaml(data,file: file)

    articles = data[:articles]

    news = news_class.new

    articles&.each do |key,hash|
      key = key.to_s # Change from a symbol
      news.add_article(article_class.load_data(key,hash),key: key)
    end

    return news
  end

  # Re-keys an article under +url+ if that upgrades it to a favored (https) URL.
  #
  # Nothing happens if the article's current URL is already favored or if
  # +url+ is not favored. URLs are compared as Strings so that URI objects
  # are handled the same as Strings, and the new URL is stored as a String.
  #
  # @param article [Article] an article of this news
  # @param url     [String,URI,nil] the candidate URL
  def update_article(article,url)
    url = url.to_s unless url.nil?

    # Favor https.
    return if FAVORED_URL.match?(article.url.to_s)
    return if url.nil? || !FAVORED_URL.match?(url)

    @articles.delete(article.url)
    @articles[url] = article
    # Keep the SHA-256 index in sync; else it would still hold the old URL.
    @sha256s[article.sha256] = url
    article.url = url
  end

  # @return [Article,nil] the article stored under +key+
  def article(key)
    return @articles[key]
  end

  # @return [Article,nil] the first stored article whose SHA-256 equals +sha256+
  def article_with_sha256(sha256)
    return @articles.each_value.find { |a| a.sha256 == sha256 }
  end

  # @return [true,false] whether an article is stored under +key+
  def article?(key)
    return @articles.key?(key)
  end

  # @return [true,false] whether +sha256+ is in the SHA-256 index
  def sha256?(sha256)
    return @sha256s.key?(sha256)
  end

  # @return [String] this news dumped as YAML via Util.dump_yaml()
  def to_s
    # Put each Word on one line (flow/inline style).
    return Util.dump_yaml(self,flow_level: 8)
  end
end
|
134
|
+
|
135
|
+
###
# News that loads from/saves to 'nhk_news_web_regular.yml' by default.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class FutsuuNews < News
  DEFAULT_FILENAME = 'nhk_news_web_regular.yml'
  DEFAULT_FILE = build_file(DEFAULT_FILENAME)

  class << self
    # Same as News.load_data(), but builds a FutsuuNews of Articles.
    def load_data(data,**kargs)
      News.load_data(data,article_class: Article,news_class: FutsuuNews,**kargs)
    end

    # Same as News.load_file(), but defaults to {DEFAULT_FILE} and
    # builds a FutsuuNews of Articles.
    def load_file(file=DEFAULT_FILE,**kargs)
      News.load_file(file,article_class: Article,news_class: FutsuuNews,**kargs)
    end
  end

  # Same as the inherited +save_file+, but defaults to {DEFAULT_FILE}.
  def save_file(file=DEFAULT_FILE,**kargs)
    super(file,**kargs)
  end
end
|
155
|
+
|
156
|
+
###
# News that loads from/saves to 'nhk_news_web_easy.yml' by default.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class YasashiiNews < News
  DEFAULT_FILENAME = 'nhk_news_web_easy.yml'
  DEFAULT_FILE = build_file(DEFAULT_FILENAME)

  class << self
    # Same as News.load_data(), but builds a YasashiiNews of Articles.
    def load_data(data,**kargs)
      News.load_data(data,article_class: Article,news_class: YasashiiNews,**kargs)
    end

    # Same as News.load_file(), but defaults to {DEFAULT_FILE} and
    # builds a YasashiiNews of Articles.
    def load_file(file=DEFAULT_FILE,**kargs)
      News.load_file(file,article_class: Article,news_class: YasashiiNews,**kargs)
    end
  end

  # Same as the inherited +save_file+, but defaults to {DEFAULT_FILE}.
  def save_file(file=DEFAULT_FILE,**kargs)
    super(file,**kargs)
  end
end
|
176
|
+
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'nhkore/word'
|
25
|
+
|
26
|
+
|
27
|
+
module NHKore
|
28
|
+
###
# Base class for cleaning up ("polishing") a String in two steps,
# {#begin_polish} and then {#end_polish}. Both steps return the String
# unchanged here; subclasses override the step(s) they need.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class Polisher
  # First polishing step; returns +str+ unchanged by default.
  def begin_polish(str)
    return str
  end

  # Last polishing step; returns +str+ unchanged by default, so that
  # {#polish} also works on a Polisher that doesn't override it.
  def end_polish(str)
    return str
  end

  # @return the result of running +str+ through {#begin_polish} and then {#end_polish}
  def polish(str)
    return end_polish(begin_polish(str))
  end

  # Polishes a Word or a String with one or more polishers.
  #
  # For a Word, a new Word is built from the polished kana and kanji (and
  # the original Word); anything else is passed through each polisher's
  # {#polish} in order.
  #
  # @param obj       [Word,String,nil] what to polish
  # @param polishers [Polisher,Array<Polisher>,nil] a single polisher or a list of them
  # @return [Word,String,nil] +nil+ if +obj+ is +nil+; +obj+ itself if there are no polishers
  def self.polish_any(obj,polishers)
    return nil if obj.nil?

    polishers = Array(polishers)

    return obj if polishers.empty?

    if obj.is_a?(Word)
      return Word.new(
        kana: polish_any(obj.kana,polishers),
        kanji: polish_any(obj.kanji,polishers),
        word: obj
      )
    end

    # String
    return polishers.reduce(obj) { |str,polisher| polisher.polish(str) }
  end
end
|
66
|
+
|
67
|
+
###
# Polisher that strips everything except alphanumeric chars and Japanese
# dots, and that blanks out a String made up of only digits and/or dots.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class BasicPolisher < Polisher
  # @param str [String] the String to clean
  # @return [String] the cleaned String; empty if only digits/dots were left
  def end_polish(str)
    # Japanese dots are kept because they appear in names:
    # - Yunibaasaru・Sutajio・Japan
    # Numbers are kept because the kana next to them wouldn't make sense otherwise:
    # - Word { kanji: 20日, kana: はつか }
    cleaned = str.gsub(/[^[[:alnum:]]・]/,'')

    # Numbers/dots by themselves (without kanji/kana) should be ignored (empty).
    only_digits_and_dots = cleaned.gsub(/[[[:digit:]]・]+/,'').empty?

    return only_digits_and_dots ? '' : cleaned
  end
end
|
86
|
+
|
87
|
+
###
# Subclass of BasicPolisher that adds no behavior of its own.
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class BestPolisher < BasicPolisher; end
|
93
|
+
end
|
@@ -0,0 +1,137 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'nokogiri'
|
25
|
+
require 'open-uri'
|
26
|
+
|
27
|
+
require 'nhkore/util'
|
28
|
+
|
29
|
+
|
30
|
+
module NHKore
|
31
|
+
###
# Holds the source to scrape: a caller-supplied String/IO, an opened local
# file, or an opened URL (with redirects followed manually under a
# configurable rule).
#
# @author Jonathan Bradley Whited (@esotericpig)
# @since  0.2.0
###
class Scraper
  # Copied from googler (https://github.com/jarun/googler).
  USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'

  attr_accessor :is_file
  attr_accessor :str_or_io
  attr_accessor :url

  alias_method :is_file?,:is_file

  # +max_redirects+ defaults to 3 for safety (infinite-loop attack).
  #
  # All URL options: https://ruby-doc.org/stdlib-2.7.0/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
  #
  # Pass in +header: {}+ for the default HTTP header fields to be set.
  #
  # @param url           [String] URL to open, or a file path if +is_file+ is true
  # @param header        [Hash,nil] HTTP header fields; ignored for a file; if not nil, a
  #                      'User-Agent' is added unless present; the passed-in Hash is not modified
  # @param is_file       [true,false] whether +url+ is a local file
  # @param max_redirects [Integer,nil] max redirects to follow; nil or negative means 10000
  # @param max_retries   [Integer,nil] max retries on a SocketError; nil means re-raise at once
  # @param redirect_rule [nil,:lenient,:strict] +:lenient+ requires a redirect to keep the
  #                      original scheme; +:strict+ also requires the same domain (per
  #                      Util.domain()); +nil+ does no checks
  # @param str_or_io     [String,IO,nil] if not nil, used as is and nothing is opened
  # @param kargs         [Hash] extra options passed to File.open() or URI.open()
  # @raise [OpenURI::HTTPError] on an HTTP error
  # @raise [OpenURI::HTTPRedirect] if there are too many redirects or a redirect breaks +redirect_rule+
  # @raise [SocketError] if still failing after +max_retries+ retries
  def initialize(url,header: nil,is_file: false,max_redirects: 3,max_retries: 3,redirect_rule: :strict,str_or_io: nil,**kargs)
    super()

    @is_file = is_file
    @url = url

    if !header.nil? && !is_file
      # Some sites (Search Engines) hate scrapers, so need HTTP header fields.
      # If this isn't enough, look at googler for more header fields to set:
      # - https://github.com/jarun/googler
      # If necessary, can use Faraday, HTTParty, or RestClient gem and
      #   pass in to str_or_io.

      # Build a new Hash so that the caller's +header+ isn't modified.
      header = {'User-Agent' => USER_AGENT}.merge(header)

      kargs = kargs.merge(header) # header will overwrite duplicate kargs entries
    end

    if !str_or_io.nil?
      @str_or_io = str_or_io
    elsif is_file
      # NHK's website tends to always use UTF-8.
      # NOTE(review): this File is kept in str_or_io and not closed here.
      @str_or_io = File.open(url,'rt:UTF-8',**kargs)
    else
      max_redirects = 10000 if max_redirects.nil? || max_redirects < 0

      top_uri = URI(url)
      top_domain = Util.domain(top_uri.host)

      begin
        # Use URI.open() instead of (Kernel.)open() for safety (code-injection attack).
        @str_or_io = URI.open(url,redirect: false,**kargs)
        @url = url
      # Must come before HTTPError, because HTTPRedirect is a subclass of it.
      rescue OpenURI::HTTPRedirect => redirect
        redirect_uri = redirect.uri

        if (max_redirects -= 1) < 0
          raise redirect.exception("redirected to URL[#{redirect_uri}]: #{redirect}")
        end

        case redirect_rule
        when :lenient,:strict
          if redirect_uri.scheme != top_uri.scheme
            raise redirect.exception("redirect scheme[#{redirect_uri.scheme}] does not match original " +
              "scheme[#{top_uri.scheme}] at redirect URL[#{redirect_uri}]: #{redirect}")
          end

          if redirect_rule == :strict
            redirect_domain = Util.domain(redirect_uri.host)

            if redirect_domain != top_domain
              raise redirect.exception("redirect domain[#{redirect_domain}] does not match original " +
                "domain[#{top_domain}] at redirect URL[#{redirect_uri}]: #{redirect}")
            end
          end
        end

        # Try again with the redirect's URL.
        url = redirect_uri

        retry
      rescue OpenURI::HTTPError => e
        raise e.exception("HTTP error[#{e}] at URL[#{url}]")
      rescue SocketError
        raise if max_retries.nil? || (max_retries -= 1) < 0

        retry
      end
    end
  end

  # @return the result of parsing +str_or_io+ with Nokogiri::HTML()
  def html_doc
    return Nokogiri::HTML(@str_or_io)
  end

  # Joins +relative_url+ onto this scraper's URL.
  #
  # @return [URI,nil] +nil+ if this scraper is for a file
  def join_url(relative_url)
    # For a file, don't know what to do.
    # It would be unsafe to return something else;
    #   for example, it could return a lot of "../../../" to your root dir.
    return nil if @is_file

    return URI::join(@url,relative_url)
  end
end
|
137
|
+
end
|