text_detector 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/bin/benchmark +32 -16
- data/lib/text_detector/detector/base.rb +4 -0
- data/lib/text_detector/detector/regexp.rb +14 -2
- data/lib/text_detector/detector/simple.rb +20 -3
- data/lib/text_detector/executor.rb +5 -1
- data/lib/text_detector/version.rb +1 -1
- data/lib/text_detector.rb +5 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a7d68797d530b617238813bcadb52cf8649d13f6
|
4
|
+
data.tar.gz: f615523b8676171aebb035c2d4432a3c596f2126
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b0263082fe4138421f104844389afdf78aef50b864d6dbd7c2021176296da7f8ec86e41241bc4a80ff4d461de0fb38c8dcc5bfc3ff6ca8dc6eaae885c8bd274
|
7
|
+
data.tar.gz: 0c542dde4eff06e4a5a80be840acbde7e9ac3b061e03e84aea633a4b28fb032fa1c1e056d1b84d45f02ddaab79640408b820ddbfc532571de306f3f3923785cc
|
data/README.md
CHANGED
data/bin/benchmark
CHANGED
@@ -51,39 +51,55 @@ regexp_detector = TextDetector.factory(:regexp, open('dictionary.txt'))
|
|
51
51
|
mini_simple_detector = TextDetector.factory(:simple, open('mini-dictionary.txt'))
|
52
52
|
mini_regexp_detector = TextDetector.factory(:regexp, open('mini-dictionary.txt'))
|
53
53
|
|
54
|
-
|
55
|
-
|
56
|
-
x.report do
|
54
|
+
Benchmark.bm(22) do |x|
|
55
|
+
x.report('simple detect') do
|
57
56
|
testdata.each do |text|
|
58
|
-
|
57
|
+
simple_detector.detect(text)
|
59
58
|
end
|
60
59
|
end
|
61
60
|
|
62
|
-
x.report do
|
61
|
+
x.report('regexp detect') do
|
63
62
|
testdata.each do |text|
|
64
|
-
|
63
|
+
regexp_detector.detect(text)
|
65
64
|
end
|
66
65
|
end
|
67
66
|
|
68
|
-
x.report do
|
67
|
+
x.report('mini simple detect') do
|
69
68
|
testdata.each do |text|
|
70
|
-
|
69
|
+
mini_simple_detector.detect(text)
|
71
70
|
end
|
72
71
|
end
|
73
72
|
|
74
|
-
x.report do
|
73
|
+
x.report('mini regexp detect') do
|
75
74
|
testdata.each do |text|
|
76
|
-
|
75
|
+
mini_regexp_detector.detect(text)
|
77
76
|
end
|
78
77
|
end
|
79
|
-
end
|
80
78
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
end
|
79
|
+
x.report('simple detect_all') do
|
80
|
+
testdata.each do |text|
|
81
|
+
simple_detector.detect_all(text)
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
x.report('regexp detect_all') do
|
86
|
+
testdata.each do |text|
|
87
|
+
regexp_detector.detect_all(text)
|
88
|
+
end
|
89
|
+
end
|
86
90
|
|
91
|
+
x.report('mini simple detect_all') do
|
92
|
+
testdata.each do |text|
|
93
|
+
mini_simple_detector.detect_all(text)
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
x.report('mini regexp detect_all') do
|
98
|
+
testdata.each do |text|
|
99
|
+
mini_regexp_detector.detect_all(text)
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
87
103
|
|
88
104
|
__END__
|
89
105
|
吾輩《わがはい》は猫である。名前はまだ無い。
|
data/lib/text_detector/detector/regexp.rb
CHANGED
@@ -4,8 +4,20 @@ module TextDetector
|
|
4
4
|
module Detector
|
5
5
|
class Regexp < Base
|
6
6
|
def detect(text)
|
7
|
-
|
8
|
-
|
7
|
+
matched = @re.match(TextDetector.normalize(text))
|
8
|
+
if matched
|
9
|
+
offset = matched.offset(0)
|
10
|
+
text.slice(offset[0], offset[1] - offset[0])
|
11
|
+
else
|
12
|
+
nil
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def detect_all(text)
|
17
|
+
TextDetector.normalize(text).to_enum(:scan, @re).map do
|
18
|
+
offset = ::Regexp.last_match.offset(0)
|
19
|
+
text.slice(offset[0], offset[1] - offset[0])
|
20
|
+
end
|
9
21
|
end
|
10
22
|
|
11
23
|
protected
|
data/lib/text_detector/detector/simple.rb
CHANGED
@@ -5,6 +5,20 @@ module TextDetector
|
|
5
5
|
# BM法っぽく(トライ木を調べてる時に見かけた実装を参考に)
|
6
6
|
class Simple < Base
|
7
7
|
def detect(text)
|
8
|
+
detect_n(text, 1).first
|
9
|
+
end
|
10
|
+
|
11
|
+
def detect_all(text)
|
12
|
+
detect_n(text)
|
13
|
+
end
|
14
|
+
|
15
|
+
private
|
16
|
+
|
17
|
+
def detect_n(text, limit = nil)
|
18
|
+
results = []
|
19
|
+
original = text
|
20
|
+
text = TextDetector.normalize(original)
|
21
|
+
|
8
22
|
# 0文字目から末尾の一つ前まで一文字ずつ始点を移動していく
|
9
23
|
0.upto(text.size - 1) do |start|
|
10
24
|
# 語の長さ配列から切り出し文字数を取り出していく
|
@@ -13,12 +27,15 @@ module TextDetector
|
|
13
27
|
# 切り出した文字列の長さが、切り出し分より短ければ次のターン
|
14
28
|
break if size > target.size
|
15
29
|
|
16
|
-
#
|
17
|
-
|
30
|
+
# 切り出した文字列が辞書に含まれていれば記録
|
31
|
+
results << original[start, size] if dictionary.lookup(target)
|
32
|
+
|
33
|
+
# 制限数までヒットしたら探索終了
|
34
|
+
return results if limit && results.size == limit
|
18
35
|
end
|
19
36
|
end
|
20
37
|
|
21
|
-
|
38
|
+
results
|
22
39
|
end
|
23
40
|
end
|
24
41
|
end
|
data/lib/text_detector/executor.rb
CHANGED
@@ -8,7 +8,11 @@ module TextDetector
|
|
8
8
|
end
|
9
9
|
|
10
10
|
def detect(text)
|
11
|
-
@detector.detect(TextDetector.normalize(text))
|
11
|
+
@detector.detect(TextDetector.shallow_normalize(text))
|
12
|
+
end
|
13
|
+
|
14
|
+
def detect_all(text)
|
15
|
+
@detector.detect_all(TextDetector.shallow_normalize(text))
|
12
16
|
end
|
13
17
|
end
|
14
18
|
end
|
data/lib/text_detector.rb
CHANGED
@@ -7,7 +7,11 @@ module TextDetector
|
|
7
7
|
Executor.new type, dictionary
|
8
8
|
end
|
9
9
|
|
10
|
+
def self.shallow_normalize(text)
|
11
|
+
text.unicode_normalize(:nfc)
|
12
|
+
end
|
13
|
+
|
10
14
|
def self.normalize(text)
|
11
|
-
NKF.nkf('--katakana -w', text)
|
15
|
+
NKF.nkf('--katakana -w', shallow_normalize(text))
|
12
16
|
end
|
13
17
|
end
|