nhkore 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -1
- data/README.md +18 -6
- data/Rakefile +11 -16
- data/bin/nhkore +1 -3
- data/lib/nhkore/app.rb +616 -0
- data/lib/nhkore/article.rb +130 -0
- data/lib/nhkore/article_scraper.rb +653 -0
- data/lib/nhkore/cleaner.rb +91 -0
- data/lib/nhkore/cli/bing_cmd.rb +220 -0
- data/lib/nhkore/cli/fx_cmd.rb +116 -0
- data/lib/nhkore/cli/get_cmd.rb +153 -0
- data/lib/nhkore/cli/news_cmd.rb +375 -0
- data/lib/nhkore/cli/sift_cmd.rb +382 -0
- data/lib/nhkore/defn.rb +104 -0
- data/lib/nhkore/dict.rb +80 -0
- data/lib/nhkore/dict_scraper.rb +76 -0
- data/lib/nhkore/entry.rb +104 -0
- data/lib/nhkore/error.rb +35 -0
- data/lib/nhkore/fileable.rb +48 -0
- data/lib/nhkore/missingno.rb +92 -0
- data/lib/nhkore/news.rb +176 -0
- data/lib/nhkore/polisher.rb +93 -0
- data/lib/nhkore/scraper.rb +137 -0
- data/lib/nhkore/search_link.rb +188 -0
- data/lib/nhkore/search_scraper.rb +152 -0
- data/lib/nhkore/sifter.rb +339 -0
- data/lib/nhkore/splitter.rb +90 -0
- data/lib/nhkore/util.rb +190 -0
- data/lib/nhkore/variator.rb +87 -0
- data/lib/nhkore/version.rb +1 -1
- data/lib/nhkore/word.rb +134 -17
- data/lib/nhkore.rb +39 -40
- data/nhkore.gemspec +23 -8
- data/test/{nhkore_tester.rb → nhkore/test_helper.rb} +3 -1
- data/test/nhkore_test.rb +8 -6
- metadata +204 -11
@@ -0,0 +1,130 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'time'
|
25
|
+
|
26
|
+
require 'nhkore/util'
|
27
|
+
require 'nhkore/word'
|
28
|
+
|
29
|
+
|
30
|
+
module NHKore
  ###
  # A single article: its metadata (+datetime+, +title+, +url+, +futsuurl+,
  # +sha256+) together with a Hash of words keyed by each word's +key+.
  #
  # Words are duck-typed here: {#add_word} only needs them to respond to
  # +key+, +freq+/+freq=+, +defn+/+defn=+, and +eng+/+eng=+.
  #
  # @author Jonathan Bradley Whited (@esotericpig)
  # @since  0.2.0
  ###
  class Article
    attr_accessor :datetime
    attr_accessor :futsuurl
    attr_accessor :sha256
    attr_accessor :title
    attr_accessor :url
    attr_reader :words

    # Creates an article with every attribute set to +nil+ and an empty
    # +words+ Hash.
    def initialize()
      super()

      @datetime = nil
      @futsuurl = nil
      @sha256 = nil
      @title = nil
      @url = nil
      @words = {}
    end

    # Adds +word+ to this article, or merges it into the word already stored
    # under the same +word.key+.
    #
    # When merging, the stored word's +freq+ is increased (by +word.freq+ if
    # +use_freq+ is true, else by 1), and its +defn+/+eng+ are replaced only
    # if the incoming value is longer (compared via +to_s().length+, so +nil+
    # counts as length 0).
    #
    # Why does this not look up the kanji/kana only and then update the other
    # kana/kanji part appropriately?
    # - There are some words like +行って+. Without the kana, it's difficult to
    #   determine what kana it should be. Should it be +いって+ or +おこなって+?
    # - Similarly, if we just have +いって+, should this be +行って+ or +言って+?
    # - Therefore, if we only have the kanji or only have the kana, we don't
    #   try to populate the other value.
    #
    # @param word [#key,#freq,#defn,#eng] the word to add or merge
    # @param use_freq [true,false] add +word.freq+ instead of 1 when merging
    # @return the word object stored in +words+ for +word.key+ (the passed-in
    #         +word+ itself when the key was new)
    def add_word(word,use_freq: false)
      curr_word = @words[word.key]

      # First time this key is seen: store the word untouched (its own freq
      # is kept as-is, regardless of +use_freq+).
      if curr_word.nil?()
        @words[word.key] = word

        return word
      end

      curr_word.freq += (use_freq ? word.freq : 1)

      # Keep whichever definition/English text is longer.
      curr_word.defn = word.defn if word.defn.to_s().length > curr_word.defn.to_s().length
      curr_word.eng = word.eng if word.eng.to_s().length > curr_word.eng.to_s().length

      return curr_word
    end

    # Fills +coder+ with this article's fields under Symbol keys.
    # +datetime+ is written as an ISO 8601 String (or +nil+ if unset).
    #
    # NOTE(review): this matches the +encode_with+ hook that Psych (YAML)
    # calls when dumping an object -- confirm against the callers.
    #
    # @param coder [#[]=] receiver of the key/value pairs
    def encode_with(coder)
      # Order matters.

      coder[:datetime] = @datetime.nil?() ? @datetime : @datetime.iso8601()
      coder[:title] = @title
      coder[:url] = @url
      coder[:futsuurl] = @futsuurl
      coder[:sha256] = @sha256
      coder[:words] = @words
    end

    # Builds an Article from a Hash with Symbol keys (+:datetime+, +:title+,
    # +:url+, +:futsuurl+, +:sha256+, +:words+).
    #
    # +:datetime+ is parsed with +Time.iso8601+ unless
    # +Util.empty_web_str?+ reports it as empty, in which case the article's
    # +datetime+ is +nil+. Each entry of +:words+ is converted with
    # +Word.load_data+ and stored under its key converted to a String.
    #
    # @param key unused by this method
    # @param hash [Hash] the article's data
    # @return [Article] the populated article
    def self.load_data(key,hash)
      datetime = hash[:datetime]
      words = hash[:words]

      article = Article.new()

      article.datetime = Util.empty_web_str?(datetime) ? nil : Time.iso8601(datetime)
      article.futsuurl = hash[:futsuurl]
      article.sha256 = hash[:sha256]
      article.title = hash[:title]
      article.url = hash[:url]

      unless words.nil?()
        words.each() do |k,h|
          word_key = k.to_s() # Change from a symbol
          article.words[word_key] = Word.load_data(word_key,h)
        end
      end

      return article
    end

    # Renders this article as multi-line text: the URL as a heading, then
    # one line per metadata field, then (unless +mini+) one line per word
    # using each word's own +to_s+.
    #
    # @param mini [true,false] if true, leave out the +words+ section
    # @return [String] a new, unfrozen String
    def to_s(mini: false)
      s = ''.dup()

      s << "'#{@url}':"
      s << "\n datetime: '#{@datetime}'"
      s << "\n title: '#{@title}'"
      s << "\n url: '#{@url}'"
      s << "\n futsuurl: '#{@futsuurl}'"
      s << "\n sha256: '#{@sha256}'"

      if !mini
        s << "\n words:"
        @words.each_value() do |word|
          s << "\n #{word}"
        end
      end

      return s
    end
  end
end
|