greeb 0.1.0.rc3 → 0.1.0.rc4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +5 -0
- data/lib/greeb/segmentator.rb +67 -0
- data/lib/greeb/version.rb +1 -1
- data/spec/segmentator_spec.rb +20 -1
- metadata +3 -3
data/README.md
CHANGED
@@ -132,6 +132,11 @@ systematic and awesome.
|
|
132
132
|
|
133
133
|
## Build Status [<img src="https://secure.travis-ci.org/eveel/greeb.png"/>](http://travis-ci.org/eveel/greeb)
|
134
134
|
|
135
|
+
If you're using [Rubinius](http://rubini.us), please note that it has an
|
136
|
+
incompatible `StringScanner` implementation. More information can be
|
137
|
+
found at the following link:
|
138
|
+
<https://github.com/rubinius/rubinius/issues/1808>.
|
139
|
+
|
135
140
|
## Dependency Status [<img src="https://gemnasium.com/eveel/greeb.png?travis"/>](https://gemnasium.com/eveel/greeb)
|
136
141
|
|
137
142
|
## Copyright
|
data/lib/greeb/segmentator.rb
CHANGED
@@ -33,6 +33,15 @@ class Greeb::Segmentator
|
|
33
33
|
@sentences
|
34
34
|
end
|
35
35
|
|
36
|
+
# Subsentences memoization method.
|
37
|
+
#
|
38
|
+
# @return [Set<Greeb::Entity>] a set of subsentences.
|
39
|
+
#
|
40
|
+
def subsentences
|
41
|
+
detect_subsentences! unless @subsentences
|
42
|
+
@subsentences
|
43
|
+
end
|
44
|
+
|
36
45
|
# Extract tokens from the set of sentences.
|
37
46
|
#
|
38
47
|
# @param sentences [Array<Greeb::Entity>] a list of sentences.
|
@@ -48,6 +57,21 @@ class Greeb::Segmentator
|
|
48
57
|
]
|
49
58
|
end
|
50
59
|
|
60
|
+
# Extract subsentences from the set of sentences.
|
61
|
+
#
|
62
|
+
# @param sentences [Array<Greeb::Entity>] a list of sentences.
|
63
|
+
#
|
64
|
+
# @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
|
65
|
+
# sentences as keys and subsentences arrays as values.
|
66
|
+
#
|
67
|
+
def subextract *sentences
|
68
|
+
Hash[
|
69
|
+
sentences.map do |s|
|
70
|
+
[s, subsentences.select { |ss| ss.from >= s.from and ss.to <= s.to }]
|
71
|
+
end
|
72
|
+
]
|
73
|
+
end
|
74
|
+
|
51
75
|
protected
|
52
76
|
# Implementation of the sentence detection method. This method
|
53
77
|
# changes the `@sentences` ivar.
|
@@ -84,6 +108,41 @@ class Greeb::Segmentator
|
|
84
108
|
nil.tap { @sentences << rest if rest.from and rest.to }
|
85
109
|
end
|
86
110
|
|
111
|
+
# Implementation of the subsentence detection method. This method
|
112
|
+
# changes the `@subsentences` ivar.
|
113
|
+
#
|
114
|
+
# @return [nil] nothing.
|
115
|
+
#
|
116
|
+
def detect_subsentences!
|
117
|
+
@subsentences = SortedSet.new
|
118
|
+
|
119
|
+
rest = tokens.inject(new_subsentence) do |subsentence, token|
|
120
|
+
if !subsentence.from and SENTENCE_DOESNT_START.include?(token.type)
|
121
|
+
next subsentence
|
122
|
+
end
|
123
|
+
|
124
|
+
subsentence.from = token.from unless subsentence.from
|
125
|
+
|
126
|
+
next subsentence if subsentence.to and subsentence.to > token.to
|
127
|
+
|
128
|
+
if [:punct, :spunct].include? token.type
|
129
|
+
subsentence.to = tokens.
|
130
|
+
select { |t| t.from >= token.from }.
|
131
|
+
inject(token) { |r, t| break r if t.type != token.type; t }.
|
132
|
+
to
|
133
|
+
|
134
|
+
@subsentences << subsentence
|
135
|
+
subsentence = new_subsentence
|
136
|
+
elsif :separ != token.type
|
137
|
+
subsentence.to = token.to
|
138
|
+
end
|
139
|
+
|
140
|
+
subsentence
|
141
|
+
end
|
142
|
+
|
143
|
+
nil.tap { @subsentences << rest if rest.from and rest.to }
|
144
|
+
end
|
145
|
+
|
87
146
|
private
|
88
147
|
# Create a new instance of {Greeb::Entity} with `:sentence` type.
|
89
148
|
#
|
@@ -92,4 +151,12 @@ class Greeb::Segmentator
|
|
92
151
|
def new_sentence
|
93
152
|
Greeb::Entity.new(nil, nil, :sentence)
|
94
153
|
end
|
154
|
+
|
155
|
+
# Create a new instance of {Greeb::Entity} with `:subsentence` type.
|
156
|
+
#
|
157
|
+
# @return [Greeb::Entity] a new entity instance.
|
158
|
+
#
|
159
|
+
def new_subsentence
|
160
|
+
Greeb::Entity.new(nil, nil, :subsentence)
|
161
|
+
end
|
95
162
|
end
|
data/lib/greeb/version.rb
CHANGED
data/spec/segmentator_spec.rb
CHANGED
@@ -89,8 +89,10 @@ module Greeb
|
|
89
89
|
|
90
90
|
subject { Segmentator.new(@tokenizer) }
|
91
91
|
|
92
|
+
let(:sentences) { subject.sentences }
|
93
|
+
|
92
94
|
it 'should be extracted' do
|
93
|
-
subject.extract(*
|
95
|
+
subject.extract(*sentences).must_equal({
|
94
96
|
Entity.new(0, 6, :sentence) => [
|
95
97
|
Entity.new(0, 5, :letter),
|
96
98
|
Entity.new(5, 6, :punct)
|
@@ -108,5 +110,22 @@ module Greeb
|
|
108
110
|
})
|
109
111
|
end
|
110
112
|
end
|
113
|
+
|
114
|
+
describe 'subsentence extractor' do
|
115
|
+
before { @tokenizer = Tokenizer.new('Hello, I am JC Denton.') }
|
116
|
+
|
117
|
+
subject { Segmentator.new(@tokenizer) }
|
118
|
+
|
119
|
+
let(:sentences) { subject.sentences }
|
120
|
+
|
121
|
+
it 'should extract subsentences' do
|
122
|
+
subject.subextract(*sentences).must_equal({
|
123
|
+
Entity.new(0, 22, :sentence) => [
|
124
|
+
Entity.new(0, 6, :subsentence),
|
125
|
+
Entity.new(7, 22, :subsentence)
|
126
|
+
]
|
127
|
+
})
|
128
|
+
end
|
129
|
+
end
|
111
130
|
end
|
112
131
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0.rc3
|
4
|
+
version: 0.1.0.rc4
|
5
5
|
prerelease: 6
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-07-
|
12
|
+
date: 2012-07-20 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
@@ -112,7 +112,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
112
112
|
version: '0'
|
113
113
|
segments:
|
114
114
|
- 0
|
115
|
-
hash:
|
115
|
+
hash: 1130932854600612903
|
116
116
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
117
117
|
none: false
|
118
118
|
requirements:
|