greeb 0.1.0.rc3 → 0.1.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -132,6 +132,11 @@ systematic and awesome.
132
132
 
133
133
  ## Build Status [<img src="https://secure.travis-ci.org/eveel/greeb.png"/>](http://travis-ci.org/eveel/greeb)
134
134
 
135
+ If you're using [Rubinius](http://rubini.us), please note that it has an
136
+ incompatible `StringScanner` implementation. More information can be
137
+ found at the following link:
138
+ <https://github.com/rubinius/rubinius/issues/1808>.
139
+
135
140
  ## Dependency Status [<img src="https://gemnasium.com/eveel/greeb.png?travis"/>](https://gemnasium.com/eveel/greeb)
136
141
 
137
142
  ## Copyright
@@ -33,6 +33,15 @@ class Greeb::Segmentator
33
33
  @sentences
34
34
  end
35
35
 
36
+ # Subsentences memoization method.
37
+ #
38
+ # @return [Set<Greeb::Entity>] a set of subsentences.
39
+ #
40
+ def subsentences
41
+ detect_subsentences! unless @subsentences
42
+ @subsentences
43
+ end
44
+
36
45
  # Extract tokens from the set of sentences.
37
46
  #
38
47
  # @param sentences [Array<Greeb::Entity>] a list of sentences.
@@ -48,6 +57,21 @@ class Greeb::Segmentator
48
57
  ]
49
58
  end
50
59
 
60
+ # Extract subsentences from the set of sentences.
61
+ #
62
+ # @param sentences [Array<Greeb::Entity>] a list of sentences.
63
+ #
64
+ # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
65
+ # sentences as keys and subsentences arrays as values.
66
+ #
67
+ def subextract *sentences
68
+ Hash[
69
+ sentences.map do |s|
70
+ [s, subsentences.select { |ss| ss.from >= s.from and ss.to <= s.to }]
71
+ end
72
+ ]
73
+ end
74
+
51
75
  protected
52
76
  # Implementation of the sentence detection method. This method
53
77
  # changes the `@sentences` ivar.
@@ -84,6 +108,41 @@ class Greeb::Segmentator
84
108
  nil.tap { @sentences << rest if rest.from and rest.to }
85
109
  end
86
110
 
111
+ # Implementation of the subsentence detection method. This method
112
+ # changes the `@subsentences` ivar.
113
+ #
114
+ # @return [nil] nothing.
115
+ #
116
+ def detect_subsentences!
117
+ @subsentences = SortedSet.new
118
+
119
+ rest = tokens.inject(new_subsentence) do |subsentence, token|
120
+ if !subsentence.from and SENTENCE_DOESNT_START.include?(token.type)
121
+ next subsentence
122
+ end
123
+
124
+ subsentence.from = token.from unless subsentence.from
125
+
126
+ next subsentence if subsentence.to and subsentence.to > token.to
127
+
128
+ if [:punct, :spunct].include? token.type
129
+ subsentence.to = tokens.
130
+ select { |t| t.from >= token.from }.
131
+ inject(token) { |r, t| break r if t.type != token.type; t }.
132
+ to
133
+
134
+ @subsentences << subsentence
135
+ subsentence = new_subsentence
136
+ elsif :separ != token.type
137
+ subsentence.to = token.to
138
+ end
139
+
140
+ subsentence
141
+ end
142
+
143
+ nil.tap { @subsentences << rest if rest.from and rest.to }
144
+ end
145
+
87
146
  private
88
147
  # Create a new instance of {Greeb::Entity} with `:sentence` type.
89
148
  #
@@ -92,4 +151,12 @@ class Greeb::Segmentator
92
151
  def new_sentence
93
152
  Greeb::Entity.new(nil, nil, :sentence)
94
153
  end
154
+
155
+ # Create a new instance of {Greeb::Entity} with `:subsentence` type.
156
+ #
157
+ # @return [Greeb::Entity] a new entity instance.
158
+ #
159
+ def new_subsentence
160
+ Greeb::Entity.new(nil, nil, :subsentence)
161
+ end
95
162
  end
data/lib/greeb/version.rb CHANGED
@@ -5,5 +5,5 @@
5
5
  module Greeb
6
6
  # Version of Greeb.
7
7
  #
8
- VERSION = '0.1.0.rc3'
8
+ VERSION = '0.1.0.rc4'
9
9
  end
@@ -89,8 +89,10 @@ module Greeb
89
89
 
90
90
  subject { Segmentator.new(@tokenizer) }
91
91
 
92
+ let(:sentences) { subject.sentences }
93
+
92
94
  it 'should be extracted' do
93
- subject.extract(*subject.sentences).must_equal({
95
+ subject.extract(*sentences).must_equal({
94
96
  Entity.new(0, 6, :sentence) => [
95
97
  Entity.new(0, 5, :letter),
96
98
  Entity.new(5, 6, :punct)
@@ -108,5 +110,22 @@ module Greeb
108
110
  })
109
111
  end
110
112
  end
113
+
114
+ describe 'subsentence extractor' do
115
+ before { @tokenizer = Tokenizer.new('Hello, I am JC Denton.') }
116
+
117
+ subject { Segmentator.new(@tokenizer) }
118
+
119
+ let(:sentences) { subject.sentences }
120
+
121
+ it 'should extract subsentences' do
122
+ subject.subextract(*sentences).must_equal({
123
+ Entity.new(0, 22, :sentence) => [
124
+ Entity.new(0, 6, :subsentence),
125
+ Entity.new(7, 22, :subsentence)
126
+ ]
127
+ })
128
+ end
129
+ end
111
130
  end
112
131
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0.rc3
4
+ version: 0.1.0.rc4
5
5
  prerelease: 6
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-10 00:00:00.000000000 Z
12
+ date: 2012-07-20 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
@@ -112,7 +112,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
112
112
  version: '0'
113
113
  segments:
114
114
  - 0
115
- hash: -2527935574265859361
115
+ hash: 1130932854600612903
116
116
  required_rubygems_version: !ruby/object:Gem::Requirement
117
117
  none: false
118
118
  requirements: