text_detector 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 527080011ed97c18c7c900d0b9abdf52ff75f0f9
4
- data.tar.gz: ef29428804f699210fac218d30dcb9ab8633820b
3
+ metadata.gz: a7d68797d530b617238813bcadb52cf8649d13f6
4
+ data.tar.gz: f615523b8676171aebb035c2d4432a3c596f2126
5
5
  SHA512:
6
- metadata.gz: ddaaecd37594997719d38c5fdc6d96cac0efb7b9187ea9e5dedb16dda16bf8a4b5cad05988204d1c859186840feb9771eb5964a7713144764a0c1f43237c40e5
7
- data.tar.gz: c6c3ebd4af514c0ae1f3d86a8625ba0dcdb4c25b69d3d20bf96e33157a03bfa8dff09f1607738534381694fc9feccb1285fe9e1384e5e3248409ab4fa5c2d6a8
6
+ metadata.gz: 1b0263082fe4138421f104844389afdf78aef50b864d6dbd7c2021176296da7f8ec86e41241bc4a80ff4d461de0fb38c8dcc5bfc3ff6ca8dc6eaae885c8bd274
7
+ data.tar.gz: 0c542dde4eff06e4a5a80be840acbde7e9ac3b061e03e84aea633a4b28fb032fa1c1e056d1b84d45f02ddaab79640408b820ddbfc532571de306f3f3923785cc
data/README.md CHANGED
@@ -1,3 +1,4 @@
1
+ [![Gem Version](https://badge.fury.io/rb/text_detector.svg)](http://badge.fury.io/rb/text_detector)
1
2
  [![Build Status](https://travis-ci.org/koshigoe/text_detector.svg)](https://travis-ci.org/koshigoe/text_detector)
2
3
 
3
4
  # TextDetector
data/bin/benchmark CHANGED
@@ -51,39 +51,55 @@ regexp_detector = TextDetector.factory(:regexp, open('dictionary.txt'))
51
51
  mini_simple_detector = TextDetector.factory(:simple, open('mini-dictionary.txt'))
52
52
  mini_regexp_detector = TextDetector.factory(:regexp, open('mini-dictionary.txt'))
53
53
 
54
- results = { simple: [], regexp: [] }
55
- Benchmark.bm do |x|
56
- x.report do
54
+ Benchmark.bm(22) do |x|
55
+ x.report('simple detect') do
57
56
  testdata.each do |text|
58
- results[:simple] << simple_detector.detect(text)
57
+ simple_detector.detect(text)
59
58
  end
60
59
  end
61
60
 
62
- x.report do
61
+ x.report('regexp detect') do
63
62
  testdata.each do |text|
64
- results[:regexp] << regexp_detector.detect(text)
63
+ regexp_detector.detect(text)
65
64
  end
66
65
  end
67
66
 
68
- x.report do
67
+ x.report('mini simple detect') do
69
68
  testdata.each do |text|
70
- results[:simple] << mini_simple_detector.detect(text)
69
+ mini_simple_detector.detect(text)
71
70
  end
72
71
  end
73
72
 
74
- x.report do
73
+ x.report('mini regexp detect') do
75
74
  testdata.each do |text|
76
- results[:regexp] << mini_regexp_detector.detect(text)
75
+ mini_regexp_detector.detect(text)
77
76
  end
78
77
  end
79
- end
80
78
 
81
- if results[:simple] == results[:regexp]
82
- puts 'ok'
83
- else
84
- puts 'ng'
85
- end
79
+ x.report('simple detect_all') do
80
+ testdata.each do |text|
81
+ simple_detector.detect_all(text)
82
+ end
83
+ end
84
+
85
+ x.report('regexp detect_all') do
86
+ testdata.each do |text|
87
+ regexp_detector.detect_all(text)
88
+ end
89
+ end
86
90
 
91
+ x.report('mini simple detect_all') do
92
+ testdata.each do |text|
93
+ mini_simple_detector.detect_all(text)
94
+ end
95
+ end
96
+
97
+ x.report('mini regexp detect_all') do
98
+ testdata.each do |text|
99
+ mini_regexp_detector.detect_all(text)
100
+ end
101
+ end
102
+ end
87
103
 
88
104
  __END__
89
105
   吾輩《わがはい》は猫である。名前はまだ無い。
@@ -12,6 +12,10 @@ module TextDetector
12
12
  raise NotImplementedError
13
13
  end
14
14
 
15
+ def detect_all(text)
16
+ raise NotImplementedError
17
+ end
18
+
15
19
  protected
16
20
 
17
21
  def setup
@@ -4,8 +4,20 @@ module TextDetector
4
4
  module Detector
5
5
  class Regexp < Base
6
6
  def detect(text)
7
- detected = @re.match(text)
8
- detected ? detected.to_s : nil
7
+ matched = @re.match(TextDetector.normalize(text))
8
+ if matched
9
+ offset = matched.offset(0)
10
+ text.slice(offset[0], offset[1] - offset[0])
11
+ else
12
+ nil
13
+ end
14
+ end
15
+
16
+ def detect_all(text)
17
+ TextDetector.normalize(text).to_enum(:scan, @re).map do
18
+ offset = ::Regexp.last_match.offset(0)
19
+ text.slice(offset[0], offset[1] - offset[0])
20
+ end
9
21
  end
10
22
 
11
23
  protected
@@ -5,6 +5,20 @@ module TextDetector
5
5
  # BM法っぽく(トライ木を調べてる時に見かけた実装を参考に)
6
6
  class Simple < Base
7
7
  def detect(text)
8
+ detect_n(text, 1).first
9
+ end
10
+
11
+ def detect_all(text)
12
+ detect_n(text)
13
+ end
14
+
15
+ private
16
+
17
+ def detect_n(text, limit = nil)
18
+ results = []
19
+ original = text
20
+ text = TextDetector.normalize(original)
21
+
8
22
  # 0文字目から末尾の一つ前まで一文字ずつ始点を移動していく
9
23
  0.upto(text.size - 1) do |start|
10
24
  # 語の長さ配列から切り出し文字数を取り出していく
@@ -13,12 +27,15 @@ module TextDetector
13
27
  # 切り出した文字列の長さが、切り出し分より短ければ次のターン
14
28
  break if size > target.size
15
29
 
16
- # 切り出した文字列が辞書に含まれていれば探索終了
17
- return target if dictionary.lookup(target)
30
+ # 切り出した文字列が辞書に含まれていれば記録
31
+ results << original[start, size] if dictionary.lookup(target)
32
+
33
+ # 制限数までヒットしたら探索終了
34
+ return results if limit && results.size == limit
18
35
  end
19
36
  end
20
37
 
21
- nil
38
+ results
22
39
  end
23
40
  end
24
41
  end
@@ -8,7 +8,11 @@ module TextDetector
8
8
  end
9
9
 
10
10
  def detect(text)
11
- @detector.detect(TextDetector.normalize(text))
11
+ @detector.detect(TextDetector.shallow_normalize(text))
12
+ end
13
+
14
+ def detect_all(text)
15
+ @detector.detect_all(TextDetector.shallow_normalize(text))
12
16
  end
13
17
  end
14
18
  end
@@ -1,3 +1,3 @@
1
1
  module TextDetector
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
data/lib/text_detector.rb CHANGED
@@ -7,7 +7,11 @@ module TextDetector
7
7
  Executor.new type, dictionary
8
8
  end
9
9
 
10
+ def self.shallow_normalize(text)
11
+ text.unicode_normalize(:nfc)
12
+ end
13
+
10
14
  def self.normalize(text)
11
- NKF.nkf('--katakana -w', text).unicode_normalize(:nfc)
15
+ NKF.nkf('--katakana -w', shallow_normalize(text))
12
16
  end
13
17
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_detector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - koshigoe