text_detector 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/bin/benchmark +32 -16
- data/lib/text_detector/detector/base.rb +4 -0
- data/lib/text_detector/detector/regexp.rb +14 -2
- data/lib/text_detector/detector/simple.rb +20 -3
- data/lib/text_detector/executor.rb +5 -1
- data/lib/text_detector/version.rb +1 -1
- data/lib/text_detector.rb +5 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a7d68797d530b617238813bcadb52cf8649d13f6
|
4
|
+
data.tar.gz: f615523b8676171aebb035c2d4432a3c596f2126
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b0263082fe4138421f104844389afdf78aef50b864d6dbd7c2021176296da7f8ec86e41241bc4a80ff4d461de0fb38c8dcc5bfc3ff6ca8dc6eaae885c8bd274
|
7
|
+
data.tar.gz: 0c542dde4eff06e4a5a80be840acbde7e9ac3b061e03e84aea633a4b28fb032fa1c1e056d1b84d45f02ddaab79640408b820ddbfc532571de306f3f3923785cc
|
data/README.md
CHANGED
data/bin/benchmark
CHANGED
@@ -51,39 +51,55 @@ regexp_detector = TextDetector.factory(:regexp, open('dictionary.txt'))
|
|
51
51
|
mini_simple_detector = TextDetector.factory(:simple, open('mini-dictionary.txt'))
|
52
52
|
mini_regexp_detector = TextDetector.factory(:regexp, open('mini-dictionary.txt'))
|
53
53
|
|
54
|
-
|
55
|
-
|
56
|
-
x.report do
|
54
|
+
Benchmark.bm(22) do |x|
|
55
|
+
x.report('simple detect') do
|
57
56
|
testdata.each do |text|
|
58
|
-
|
57
|
+
simple_detector.detect(text)
|
59
58
|
end
|
60
59
|
end
|
61
60
|
|
62
|
-
x.report do
|
61
|
+
x.report('regexp detect') do
|
63
62
|
testdata.each do |text|
|
64
|
-
|
63
|
+
regexp_detector.detect(text)
|
65
64
|
end
|
66
65
|
end
|
67
66
|
|
68
|
-
x.report do
|
67
|
+
x.report('mini simple detect') do
|
69
68
|
testdata.each do |text|
|
70
|
-
|
69
|
+
mini_simple_detector.detect(text)
|
71
70
|
end
|
72
71
|
end
|
73
72
|
|
74
|
-
x.report do
|
73
|
+
x.report('mini regexp detect') do
|
75
74
|
testdata.each do |text|
|
76
|
-
|
75
|
+
mini_regexp_detector.detect(text)
|
77
76
|
end
|
78
77
|
end
|
79
|
-
end
|
80
78
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
end
|
79
|
+
x.report('simple detect_all') do
|
80
|
+
testdata.each do |text|
|
81
|
+
simple_detector.detect_all(text)
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
x.report('regexp detect_all') do
|
86
|
+
testdata.each do |text|
|
87
|
+
regexp_detector.detect_all(text)
|
88
|
+
end
|
89
|
+
end
|
86
90
|
|
91
|
+
x.report('mini simple detect_all') do
|
92
|
+
testdata.each do |text|
|
93
|
+
mini_simple_detector.detect_all(text)
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
x.report('mini regexp detect_all') do
|
98
|
+
testdata.each do |text|
|
99
|
+
mini_regexp_detector.detect_all(text)
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
87
103
|
|
88
104
|
__END__
|
89
105
|
吾輩《わがはい》は猫である。名前はまだ無い。
|
data/lib/text_detector/detector/regexp.rb
CHANGED
@@ -4,8 +4,20 @@ module TextDetector
|
|
4
4
|
module Detector
|
5
5
|
class Regexp < Base
|
6
6
|
def detect(text)
|
7
|
-
|
8
|
-
|
7
|
+
matched = @re.match(TextDetector.normalize(text))
|
8
|
+
if matched
|
9
|
+
offset = matched.offset(0)
|
10
|
+
text.slice(offset[0], offset[1] - offset[0])
|
11
|
+
else
|
12
|
+
nil
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def detect_all(text)
|
17
|
+
TextDetector.normalize(text).to_enum(:scan, @re).map do
|
18
|
+
offset = ::Regexp.last_match.offset(0)
|
19
|
+
text.slice(offset[0], offset[1] - offset[0])
|
20
|
+
end
|
9
21
|
end
|
10
22
|
|
11
23
|
protected
|
data/lib/text_detector/detector/simple.rb
CHANGED
@@ -5,6 +5,20 @@ module TextDetector
|
|
5
5
|
# BM法っぽく(トライ木を調べてる時に見かけた実装を参考に)
|
6
6
|
class Simple < Base
|
7
7
|
def detect(text)
|
8
|
+
detect_n(text, 1).first
|
9
|
+
end
|
10
|
+
|
11
|
+
def detect_all(text)
|
12
|
+
detect_n(text)
|
13
|
+
end
|
14
|
+
|
15
|
+
private
|
16
|
+
|
17
|
+
def detect_n(text, limit = nil)
|
18
|
+
results = []
|
19
|
+
original = text
|
20
|
+
text = TextDetector.normalize(original)
|
21
|
+
|
8
22
|
# 0文字目から末尾の一つ前まで一文字ずつ始点を移動していく
|
9
23
|
0.upto(text.size - 1) do |start|
|
10
24
|
# 語の長さ配列から切り出し文字数を取り出していく
|
@@ -13,12 +27,15 @@ module TextDetector
|
|
13
27
|
# 切り出した文字列の長さが、切り出し分より短ければ次のターン
|
14
28
|
break if size > target.size
|
15
29
|
|
16
|
-
#
|
17
|
-
|
30
|
+
# 切り出した文字列が辞書に含まれていれば記録
|
31
|
+
results << original[start, size] if dictionary.lookup(target)
|
32
|
+
|
33
|
+
# 制限数までヒットしたら探索終了
|
34
|
+
return results if limit && results.size == limit
|
18
35
|
end
|
19
36
|
end
|
20
37
|
|
21
|
-
|
38
|
+
results
|
22
39
|
end
|
23
40
|
end
|
24
41
|
end
|
data/lib/text_detector/executor.rb
CHANGED
@@ -8,7 +8,11 @@ module TextDetector
|
|
8
8
|
end
|
9
9
|
|
10
10
|
def detect(text)
|
11
|
-
@detector.detect(TextDetector.normalize(text))
|
11
|
+
@detector.detect(TextDetector.shallow_normalize(text))
|
12
|
+
end
|
13
|
+
|
14
|
+
def detect_all(text)
|
15
|
+
@detector.detect_all(TextDetector.shallow_normalize(text))
|
12
16
|
end
|
13
17
|
end
|
14
18
|
end
|
data/lib/text_detector.rb
CHANGED
@@ -7,7 +7,11 @@ module TextDetector
|
|
7
7
|
Executor.new type, dictionary
|
8
8
|
end
|
9
9
|
|
10
|
+
def self.shallow_normalize(text)
|
11
|
+
text.unicode_normalize(:nfc)
|
12
|
+
end
|
13
|
+
|
10
14
|
def self.normalize(text)
|
11
|
-
NKF.nkf('--katakana -w', text)
|
15
|
+
NKF.nkf('--katakana -w', shallow_normalize(text))
|
12
16
|
end
|
13
17
|
end
|