datte 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/datte/data.txt +16020 -0
- data/lib/datte/date_parser.rb +26 -13
- data/lib/datte/datetime_table.rb +18 -10
- data/lib/datte/dattetime.rb +40 -15
- data/lib/datte/train.rb +203 -0
- data/lib/datte/version.rb +1 -1
- data/lib/datte.rb +1 -0
- data/spec/datte_spec.rb +8 -2
- metadata +3 -1
data/lib/datte/date_parser.rb
CHANGED
@@ -19,23 +19,36 @@ module Datte
|
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
22
|
-
|
22
|
+
ABSOLUTE_TIMES.each do |matcher|
|
23
|
+
if md = @body.match(matcher)
|
24
|
+
@date.update_time(md)
|
25
|
+
p @date
|
26
|
+
break
|
27
|
+
end
|
28
|
+
end
|
23
29
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
30
|
+
NOUNS.each do |matcher_s, method|
|
31
|
+
matcher = Regexp.new(matcher_s.to_s)
|
32
|
+
if md = @body.match(matcher)
|
33
|
+
eval(method)
|
34
|
+
break
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
AFTERS.each do |matcher|
|
39
|
+
if md = @body.match(matcher)
|
40
|
+
@date.after(md)
|
41
|
+
break
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
|
46
|
+
return @date.to_datetime
|
36
47
|
end
|
37
48
|
|
38
49
|
private
|
39
50
|
|
51
|
+
# Placeholder hook for relative-day words.
#
# The method has no implementation yet: it accepts +day+, does nothing with
# it and returns nil.
# NOTE(review): presumably meant to shift the parsed date by +day+ days —
# confirm the intended behaviour before relying on it.
def next_day(day)
  nil
end
|
40
53
|
end
|
41
54
|
end
|
data/lib/datte/datetime_table.rb
CHANGED
@@ -37,17 +37,25 @@ module Datte
|
|
37
37
|
'(?<min>\d{1,2})分'
|
38
38
|
].map { |pattern| Regexp.compile(pattern) }.freeze
|
39
39
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
40
|
+
# Relative-day nouns mapped to the code string that handles them.
#
# Keys are regexp sources (stored as symbols); values are Ruby snippets in the
# form 'send(:next_day, N)'.
#
# Entry order matters when a consumer stops at the first matching key: the
# longer spelling "しあさって" contains "あさって", so the three-days-ahead
# entry must come before the two-days-ahead one.
NOUNS = {
  # three days from now
  '明々後日|しあさって': 'send(:next_day, 3)',
  # the day after tomorrow
  '明後日|あさって': 'send(:next_day, 2)',
  # tomorrow
  '明日|あした|あす': 'send(:next_day, 1)',
  # today
  '今日|きょう': 'send(:next_day, 0)'
}.freeze
|
45
46
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
47
|
+
# Patterns for relative offsets ("N <unit> later"). Each pattern exposes
# exactly one named group (year, month, day, hour or min) that captures only
# the digits, so the captured text can be converted with to_i directly.
AFTERS = [
  # N years later (was '\d{1)', a malformed quantifier that never matched "3年後")
  '(?<year>\d+)年後',
  # N months later (the capture group now covers the digits only)
  '(?<month>\d{1,2})ヶ月後',
  # N days later
  '(?<day>\d{1,2})日後',
  # N hours later
  '(?<hour>\d{1,2})時間後',
  # N minutes later
  '(?<min>\d{1,2})分後'
].map { |pattern| Regexp.compile(pattern) }.freeze
|
51
59
|
|
52
60
|
class DatetimeTable
|
53
61
|
def initialize
|
data/lib/datte/dattetime.rb
CHANGED
@@ -2,22 +2,20 @@ module Datte
|
|
2
2
|
class Dattetime
|
3
3
|
|
4
4
|
DEFAULT_OPTIONS = {
|
5
|
-
force_update: false
|
5
|
+
force_update: false,
|
6
|
+
level: 1
|
6
7
|
}
|
7
8
|
|
8
|
-
attr_reader :year, :month, :day, :hour, :min
|
9
|
+
attr_reader :year, :month, :day, :hour, :min
|
9
10
|
|
10
11
|
def initialize(options = {})
|
11
12
|
@options = DEFAULT_OPTIONS.merge(options)
|
12
|
-
@date =
|
13
|
-
end
|
14
|
-
|
15
|
-
def to_s
|
16
|
-
@date.to_s
|
13
|
+
@date = DateTime.now
|
17
14
|
end
|
18
15
|
|
19
16
|
# Builds a DateTime from the collected fields.
#
# Returns nil when check_level? is false, or when the fields do not form a
# valid calendar date/time (DateTime.new raises an ArgumentError subclass in
# that case). Seconds are always 0.
#
# Only ArgumentError is rescued; the previous `rescue nil` modifier hid every
# StandardError, including genuine programming errors.
def to_datetime
  return nil unless check_level?

  DateTime.new(y, m, d, h, mi, 0)
rescue ArgumentError
  nil
end
|
22
20
|
|
23
21
|
# 年か月か日を更新
|
@@ -29,18 +27,29 @@ module Datte
|
|
29
27
|
end
|
30
28
|
|
31
29
|
# 時か分を更新
|
32
|
-
def update_time(
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
def after_date(year, month, day)
|
30
|
+
# Updates @hour and @min from a match.
#
# md      - match object handed to hour!/min!.
# options - Hash; when options[:force_update] is truthy the stored values are
#           overwritten, otherwise only fields that are still nil/false are
#           filled in. Defaults to @options.
#
# The explicit +options+ argument is now honoured (it used to be ignored in
# favour of @options), and the assignments are written out instead of being
# assembled as a string and passed to eval.
def update_time(md, options = @options)
  if options[:force_update]
    @hour = hour!(md)
    @min = min!(md)
  else
    @hour ||= hour!(md)
    @min ||= min!(md)
  end
end
|
38
35
|
|
39
|
-
#
|
40
|
-
def
|
36
|
+
# 何年後、何ヶ月後、何日後, 何時間後, 何分後
|
37
|
+
# Shifts @date forward by the offset captured in +md+.
#
# md - match object answering matched?(name) and [](name) for the groups
#      :year, :month, :day, :hour and :min.
#      NOTE(review): matched? is not defined in this block — presumably a
#      project extension of MatchData; confirm.
#
# Returns the updated @date.
#
# Fixes over the previous version:
# * DateTime is immutable, so the results of >> and + are assigned back to
#   @date instead of being thrown away;
# * the hour and minute offsets are guarded like the others, so a match that
#   lacks those groups no longer reaches md[:hour];
# * the minute offset reads md[:min] (it used to read md[:hour]).
def after(md)
  @date = @date >> (md[:year].to_i * 12) if md.matched?(:year)      # N years later
  @date = @date >> md[:month].to_i if md.matched?(:month)           # N months later
  @date += md[:day].to_i if md.matched?(:day)                       # N days later
  @date += Rational(md[:hour].to_i, 24) if md.matched?(:hour)       # N hours later
  @date += Rational(md[:min].to_i, 24 * 60) if md.matched?(:min)    # N minutes later
  @date
end
|
42
44
|
|
43
45
|
private
|
46
|
+
|
47
|
+
# Year to use: the stored @year, otherwise the current year from now.
def y
  return @year if @year

  now[:year]
end

# Month to use: the stored @month, otherwise the current month from now.
def m
  return @month if @month

  now[:month]
end

# Day to use: the stored @day, otherwise the current day from now.
def d
  return @day if @day

  now[:day]
end

# Hour to use: the stored @hour, otherwise the current hour from now.
def h
  return @hour if @hour

  now[:hour]
end

# Minute to use: the stored @min, otherwise 0 (not the current minute).
def mi
  return @min if @min

  0
end
|
52
|
+
|
44
53
|
def now
|
45
54
|
d = DateTime.now
|
46
55
|
{ year: d.year, month: d.month, day: d.day, hour: d.hour, min: d.min }
|
@@ -57,6 +66,22 @@ module Datte
|
|
57
66
|
# Day taken from the match when its :day group matched (converted with
# to_i); otherwise the current day from now.
def day!(md)
  return md[:day].to_i if md.matched?(:day)

  now[:day]
end
|
69
|
+
|
70
|
+
# Hour taken from the match when its :hour group matched (converted with
# to_i); otherwise the current hour from now.
def hour!(md)
  return md[:hour].to_i if md.matched?(:hour)

  now[:hour]
end
|
73
|
+
|
74
|
+
# Minute taken from the match when its :min group matched (converted with
# to_i); otherwise 0.
def min!(md)
  return md[:min].to_i if md.matched?(:min)

  0
end
|
77
|
+
|
78
|
+
# True when the number of non-nil fields among @year, @month, @day, @hour
# and @min is strictly greater than @options[:level].
def check_level?
  filled = [@year, @month, @day, @hour, @min].count { |part| !part.nil? }
  @options[:level] < filled
end
|
60
85
|
end
|
61
86
|
end
|
62
87
|
|
data/lib/datte/train.rb
ADDED
@@ -0,0 +1,203 @@
|
|
1
|
+
module Datte
  # Turns an IOB-tagged, morphologically analysed corpus into per-token
  # feature lists and label sequences for sequence-labelling training.
  #
  # Corpus format (as consumed by corpus_read): one token per line, columns
  # separated by whitespace, surface form first and IOB tag last; sentences
  # are separated by blank lines.
  class Train
    FNAME = File.join(File.dirname(__FILE__), 'data.txt')

    # SEE: http://qiita.com/Hironsan/items/326b66711eb4196aa9d4

    # path        - corpus file to load (defaults to the bundled data.txt).
    # train_ratio - fraction of sentences, counted from the top of the file,
    #               used for training; the remainder becomes the test set.
    #               NOTE(review): the 0.9 default follows the article linked
    #               above as I recall it — confirm it is the split you want.
    #
    # The corpus is read once, +path+ is honoured, and nothing is printed
    # (the previous version read the file twice, ignored +path+ and dumped
    # debug output).
    def initialize(path = FNAME, train_ratio: 0.9)
      sents = corpus_read(path)
      cut = (sents.length * train_ratio).floor
      cut = [[cut, 0].max, sents.length].min # keep the split point in range
      @train_sents = sents[0...cut]
      @test_sents = sents[cut..-1]
    end

    # Feature lists for every training sentence (one list of per-token
    # feature arrays per sentence).
    def x_train
      @train_sents.map { |s| sent2features(s) }
    end

    # IOB label sequences for every training sentence.
    def y_train
      @train_sents.map { |s| sent2labels(s) }
    end

    # Feature lists for every test sentence.
    def x_test
      @test_sents.map { |s| sent2features(s) }
    end

    # IOB label sequences for every test sentence.
    def y_test
      @test_sents.map { |s| sent2labels(s) }
    end

    # Placeholder: training itself is not implemented yet.
    def train
    end

    private

    # True when +ch+ starts with a code point in the Hiragana block.
    def hiragana?(ch)
      0x3040 <= ch.ord && ch.ord <= 0x309F
    end

    # True when +ch+ starts with a code point in the Katakana block.
    # (Was defined as `katakana` while chara_type called `katakana?`.)
    def katakana?(ch)
      0x30A0 <= ch.ord && ch.ord <= 0x30FF
    end

    # True when +ch+ consists only of whitespace (Unicode-aware, so the
    # full-width space counts too).
    def space?(ch)
      !(ch =~ /\A[[:space:]]*\z/).nil?
    end

    # True when +ch+ consists only of decimal digits (Unicode-aware).
    def integer?(ch)
      !(ch =~ /\A[[:digit:]]+\z/).nil?
    end

    # True only for cased characters in lower case. Characters without case
    # (kana, kanji, punctuation) are not "lower", otherwise they would never
    # reach the hiragana/katakana checks in chara_type.
    def lower?(ch)
      ch == ch.downcase && ch != ch.upcase
    end

    # True only for cased characters in upper case (see lower?).
    def upper?(ch)
      ch == ch.upcase && ch != ch.downcase
    end

    # Character-class label for a single character.
    def chara_type(ch)
      if space?(ch) then 'ZSPACE'
      elsif integer?(ch) then 'ZDIGIT'
      elsif lower?(ch) then 'ZLLET'
      elsif upper?(ch) then 'ZULET'
      elsif hiragana?(ch) then 'HIRAG'
      elsif katakana?(ch) then 'KATAK'
      else 'OTHER'
      end
    end

    # Sorted, de-duplicated character-class labels of +str+ joined by '-'.
    def chara_types(str)
      str.each_char.map { |ch| chara_type(ch) }.uniq.sort.join('-')
    end

    # Part-of-speech columns of a token joined by '-': everything after the
    # surface form up to (not including) the first '*' column. When there is
    # no '*' column, everything up to the trailing IOB tag is used instead of
    # raising.
    def extract_pos(morph)
      stop = morph.index('*') || morph.length - 1
      (morph[1...stop] || []).join('-')
    end

    # Features for token +i+ of +sent+: the token's own word/type/POS plus the
    # same for the two neighbours on each side. Left neighbours also
    # contribute their IOB tag. A missing neighbour yields 'BOS'/'EOS'.
    def word2features(sent, i)
      morph = sent[i]
      features = [
        'bias',
        'word=' + morph[0],
        'type=' + chara_types(morph[0]),
        'pos_tag=' + extract_pos(morph)
      ]

      features.concat(i >= 2 ? context_features(sent[i - 2], '-2', true) : ['BOS'])
      features.concat(i >= 1 ? context_features(sent[i - 1], '-1', true) : ['BOS'])
      features.concat(i < sent.length - 1 ? context_features(sent[i + 1], '+1', false) : ['EOS'])
      features.concat(i < sent.length - 2 ? context_features(sent[i + 2], '+2', false) : ['EOS'])
      features
    end

    # Word/type/POS features of a neighbouring token, prefixed with its
    # relative +offset+; the IOB tag is appended when +with_iob+ is true.
    def context_features(morph, offset, with_iob)
      feats = [
        "#{offset}:word=#{morph[0]}",
        "#{offset}:type=#{chara_types(morph[0])}",
        "#{offset}:postag=#{extract_pos(morph)}"
      ]
      feats << "#{offset}:iobtag=#{morph[-1]}" if with_iob
      feats
    end

    # One feature array per token of +sent+. (The previous version returned
    # from inside the block after the first token and iterated one index past
    # the end.)
    def sent2features(sent)
      sent.each_index.map { |i| word2features(sent, i) }
    end

    # IOB tag (last column) of every token.
    def sent2labels(sent)
      sent.map { |morph| morph[-1] }
    end

    # Surface form (first column) of every token.
    def sent2tokens(sent)
      sent.map { |morph| morph[0] }
    end

    # Reads the corpus at +path+ and returns an array of sentences, each an
    # array of token column arrays. Blank (or whitespace-only) lines end a
    # sentence; empty sentences are skipped, and a final sentence without a
    # trailing blank line is kept.
    def corpus_read(path = FNAME)
      sents = []
      sent = []

      File.foreach(path, encoding: 'UTF-8') do |line|
        morph_info = line.strip.split(' ')
        if morph_info.empty?
          sents.push(sent) unless sent.empty?
          sent = []
        else
          sent.push(morph_info)
        end
      end

      sents.push(sent) unless sent.empty?
      sents
    end
  end
end
|
data/lib/datte/version.rb
CHANGED
data/lib/datte.rb
CHANGED
@@ -4,3 +4,4 @@ require File.join(File.dirname(__FILE__), 'datte', 'parser')
|
|
4
4
|
require File.join(File.dirname(__FILE__), 'datte', 'date_parser')
|
5
5
|
require File.join(File.dirname(__FILE__), 'datte', 'dattetime')
|
6
6
|
require File.join(File.dirname(__FILE__), 'datte', 'datetime_table')
|
7
|
+
require File.join(File.dirname(__FILE__), 'datte', 'train')
|
data/spec/datte_spec.rb
CHANGED
@@ -5,7 +5,13 @@ describe Datte do
|
|
5
5
|
expect(Datte::VERSION).not_to be nil
|
6
6
|
end
|
7
7
|
|
8
|
-
|
9
|
-
|
8
|
+
describe "::Parser" do
|
9
|
+
datte = Datte::Parser.new
|
10
|
+
|
11
|
+
it "2016/11/1日に遊ぼー" do
|
12
|
+
body = "2016/11/1日に遊ぼー"
|
13
|
+
d = DateTime.now
|
14
|
+
expect(datte.parse_date(body)).to eq(DateTime.new(2016, 11, 1, d.hour, 0, 0))
|
15
|
+
end
|
10
16
|
end
|
11
17
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datte
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- pokohide
|
@@ -73,10 +73,12 @@ files:
|
|
73
73
|
- bin/setup
|
74
74
|
- datte.gemspec
|
75
75
|
- lib/datte.rb
|
76
|
+
- lib/datte/data.txt
|
76
77
|
- lib/datte/date_parser.rb
|
77
78
|
- lib/datte/datetime_table.rb
|
78
79
|
- lib/datte/dattetime.rb
|
79
80
|
- lib/datte/parser.rb
|
81
|
+
- lib/datte/train.rb
|
80
82
|
- lib/datte/version.rb
|
81
83
|
- spec/datte_spec.rb
|
82
84
|
- spec/spec_helper.rb
|