text_detector 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 527080011ed97c18c7c900d0b9abdf52ff75f0f9
4
- data.tar.gz: ef29428804f699210fac218d30dcb9ab8633820b
3
+ metadata.gz: a7d68797d530b617238813bcadb52cf8649d13f6
4
+ data.tar.gz: f615523b8676171aebb035c2d4432a3c596f2126
5
5
  SHA512:
6
- metadata.gz: ddaaecd37594997719d38c5fdc6d96cac0efb7b9187ea9e5dedb16dda16bf8a4b5cad05988204d1c859186840feb9771eb5964a7713144764a0c1f43237c40e5
7
- data.tar.gz: c6c3ebd4af514c0ae1f3d86a8625ba0dcdb4c25b69d3d20bf96e33157a03bfa8dff09f1607738534381694fc9feccb1285fe9e1384e5e3248409ab4fa5c2d6a8
6
+ metadata.gz: 1b0263082fe4138421f104844389afdf78aef50b864d6dbd7c2021176296da7f8ec86e41241bc4a80ff4d461de0fb38c8dcc5bfc3ff6ca8dc6eaae885c8bd274
7
+ data.tar.gz: 0c542dde4eff06e4a5a80be840acbde7e9ac3b061e03e84aea633a4b28fb032fa1c1e056d1b84d45f02ddaab79640408b820ddbfc532571de306f3f3923785cc
data/README.md CHANGED
@@ -1,3 +1,4 @@
1
+ [![Gem Version](https://badge.fury.io/rb/text_detector.svg)](http://badge.fury.io/rb/text_detector)
1
2
  [![Build Status](https://travis-ci.org/koshigoe/text_detector.svg)](https://travis-ci.org/koshigoe/text_detector)
2
3
 
3
4
  # TextDetector
data/bin/benchmark CHANGED
@@ -51,39 +51,55 @@ regexp_detector = TextDetector.factory(:regexp, open('dictionary.txt'))
51
51
  mini_simple_detector = TextDetector.factory(:simple, open('mini-dictionary.txt'))
52
52
  mini_regexp_detector = TextDetector.factory(:regexp, open('mini-dictionary.txt'))
53
53
 
54
- results = { simple: [], regexp: [] }
55
- Benchmark.bm do |x|
56
- x.report do
54
+ Benchmark.bm(22) do |x|
55
+ x.report('simple detect') do
57
56
  testdata.each do |text|
58
- results[:simple] << simple_detector.detect(text)
57
+ simple_detector.detect(text)
59
58
  end
60
59
  end
61
60
 
62
- x.report do
61
+ x.report('regexp detect') do
63
62
  testdata.each do |text|
64
- results[:regexp] << regexp_detector.detect(text)
63
+ regexp_detector.detect(text)
65
64
  end
66
65
  end
67
66
 
68
- x.report do
67
+ x.report('mini simple detect') do
69
68
  testdata.each do |text|
70
- results[:simple] << mini_simple_detector.detect(text)
69
+ mini_simple_detector.detect(text)
71
70
  end
72
71
  end
73
72
 
74
- x.report do
73
+ x.report('mini regexp detect') do
75
74
  testdata.each do |text|
76
- results[:regexp] << mini_regexp_detector.detect(text)
75
+ mini_regexp_detector.detect(text)
77
76
  end
78
77
  end
79
- end
80
78
 
81
- if results[:simple] == results[:regexp]
82
- puts 'ok'
83
- else
84
- puts 'ng'
85
- end
79
+ x.report('simple detect_all') do
80
+ testdata.each do |text|
81
+ simple_detector.detect_all(text)
82
+ end
83
+ end
84
+
85
+ x.report('regexp detect_all') do
86
+ testdata.each do |text|
87
+ regexp_detector.detect_all(text)
88
+ end
89
+ end
86
90
 
91
+ x.report('mini simple detect_all') do
92
+ testdata.each do |text|
93
+ mini_simple_detector.detect_all(text)
94
+ end
95
+ end
96
+
97
+ x.report('mini regexp detect_all') do
98
+ testdata.each do |text|
99
+ mini_regexp_detector.detect_all(text)
100
+ end
101
+ end
102
+ end
87
103
 
88
104
  __END__
89
105
   吾輩《わがはい》は猫である。名前はまだ無い。
@@ -12,6 +12,10 @@ module TextDetector
12
12
  raise NotImplementedError
13
13
  end
14
14
 
15
+ def detect_all(text)
16
+ raise NotImplementedError
17
+ end
18
+
15
19
  protected
16
20
 
17
21
  def setup
@@ -4,8 +4,20 @@ module TextDetector
4
4
  module Detector
5
5
  class Regexp < Base
6
6
  def detect(text)
7
- detected = @re.match(text)
8
- detected ? detected.to_s : nil
7
+ matched = @re.match(TextDetector.normalize(text))
8
+ if matched
9
+ offset = matched.offset(0)
10
+ text.slice(offset[0], offset[1] - offset[0])
11
+ else
12
+ nil
13
+ end
14
+ end
15
+
16
+ def detect_all(text)
17
+ TextDetector.normalize(text).to_enum(:scan, @re).map do
18
+ offset = ::Regexp.last_match.offset(0)
19
+ text.slice(offset[0], offset[1] - offset[0])
20
+ end
9
21
  end
10
22
 
11
23
  protected
@@ -5,6 +5,20 @@ module TextDetector
5
5
  # BM法っぽく(トライ木を調べてる時に見かけた実装を参考に)
6
6
  class Simple < Base
7
7
  def detect(text)
8
+ detect_n(text, 1).first
9
+ end
10
+
11
+ def detect_all(text)
12
+ detect_n(text)
13
+ end
14
+
15
+ private
16
+
17
+ def detect_n(text, limit = nil)
18
+ results = []
19
+ original = text
20
+ text = TextDetector.normalize(original)
21
+
8
22
  # 0文字目から末尾の一つ前まで一文字ずつ始点を移動していく
9
23
  0.upto(text.size - 1) do |start|
10
24
  # 語の長さ配列から切り出し文字数を取り出していく
@@ -13,12 +27,15 @@ module TextDetector
13
27
  # 切り出した文字列の長さが、切り出し分より短ければ次のターン
14
28
  break if size > target.size
15
29
 
16
- # 切り出した文字列が辞書に含まれていれば探索終了
17
- return target if dictionary.lookup(target)
30
+ # 切り出した文字列が辞書に含まれていれば記録
31
+ results << original[start, size] if dictionary.lookup(target)
32
+
33
+ # 制限数までヒットしたら探索終了
34
+ return results if limit && results.size == limit
18
35
  end
19
36
  end
20
37
 
21
- nil
38
+ results
22
39
  end
23
40
  end
24
41
  end
@@ -8,7 +8,11 @@ module TextDetector
8
8
  end
9
9
 
10
10
  def detect(text)
11
- @detector.detect(TextDetector.normalize(text))
11
+ @detector.detect(TextDetector.shallow_normalize(text))
12
+ end
13
+
14
+ def detect_all(text)
15
+ @detector.detect_all(TextDetector.shallow_normalize(text))
12
16
  end
13
17
  end
14
18
  end
@@ -1,3 +1,3 @@
1
1
  module TextDetector
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
data/lib/text_detector.rb CHANGED
@@ -7,7 +7,11 @@ module TextDetector
7
7
  Executor.new type, dictionary
8
8
  end
9
9
 
10
+ def self.shallow_normalize(text)
11
+ text.unicode_normalize(:nfc)
12
+ end
13
+
10
14
  def self.normalize(text)
11
- NKF.nkf('--katakana -w', text).unicode_normalize(:nfc)
15
+ NKF.nkf('--katakana -w', shallow_normalize(text))
12
16
  end
13
17
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_detector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - koshigoe