greeb 0.1.0.rc3 → 0.1.0.rc4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -132,6 +132,11 @@ systematic and awesome.
132
132
 
133
133
  ## Build Status [<img src="https://secure.travis-ci.org/eveel/greeb.png"/>](http://travis-ci.org/eveel/greeb)
134
134
 
135
+ If you're using [Rubinius](http://rubini.us), please note that it has an
136
+ incompatible `StringScanner` implementation. More information can be
137
+ found at the following link:
138
+ <https://github.com/rubinius/rubinius/issues/1808>.
139
+
135
140
  ## Dependency Status [<img src="https://gemnasium.com/eveel/greeb.png?travis"/>](https://gemnasium.com/eveel/greeb)
136
141
 
137
142
  ## Copyright
@@ -33,6 +33,15 @@ class Greeb::Segmentator
33
33
  @sentences
34
34
  end
35
35
 
36
+ # Subsentences memoization method.
37
+ #
38
+ # @return [Set<Greeb::Entity>] a set of subsentences.
39
+ #
40
+ def subsentences
41
+ detect_subsentences! unless @subsentences
42
+ @subsentences
43
+ end
44
+
36
45
  # Extract tokens from the set of sentences.
37
46
  #
38
47
  # @param sentences [Array<Greeb::Entity>] a list of sentences.
@@ -48,6 +57,21 @@ class Greeb::Segmentator
48
57
  ]
49
58
  end
50
59
 
60
+ # Extract subsentences from the set of sentences.
61
+ #
62
+ # @param sentences [Array<Greeb::Entity>] a list of sentences.
63
+ #
64
+ # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
65
+ # sentences as keys and subsentences arrays as values.
66
+ #
67
+ def subextract *sentences
68
+ Hash[
69
+ sentences.map do |s|
70
+ [s, subsentences.select { |ss| ss.from >= s.from and ss.to <= s.to }]
71
+ end
72
+ ]
73
+ end
74
+
51
75
  protected
52
76
  # Implementation of the sentence detection method. This method
53
77
  # changes the `@sentences` ivar.
@@ -84,6 +108,41 @@ class Greeb::Segmentator
84
108
  nil.tap { @sentences << rest if rest.from and rest.to }
85
109
  end
86
110
 
111
+ # Implementation of the subsentence detection method. This method
112
+ # changes the `@subsentences` ivar.
113
+ #
114
+ # @return [nil] nothing.
115
+ #
116
+ def detect_subsentences!
117
+ @subsentences = SortedSet.new
118
+
119
+ rest = tokens.inject(new_subsentence) do |subsentence, token|
120
+ if !subsentence.from and SENTENCE_DOESNT_START.include?(token.type)
121
+ next subsentence
122
+ end
123
+
124
+ subsentence.from = token.from unless subsentence.from
125
+
126
+ next subsentence if subsentence.to and subsentence.to > token.to
127
+
128
+ if [:punct, :spunct].include? token.type
129
+ subsentence.to = tokens.
130
+ select { |t| t.from >= token.from }.
131
+ inject(token) { |r, t| break r if t.type != token.type; t }.
132
+ to
133
+
134
+ @subsentences << subsentence
135
+ subsentence = new_subsentence
136
+ elsif :separ != token.type
137
+ subsentence.to = token.to
138
+ end
139
+
140
+ subsentence
141
+ end
142
+
143
+ nil.tap { @subsentences << rest if rest.from and rest.to }
144
+ end
145
+
87
146
  private
88
147
  # Create a new instance of {Greeb::Entity} with `:sentence` type.
89
148
  #
@@ -92,4 +151,12 @@ class Greeb::Segmentator
92
151
  def new_sentence
93
152
  Greeb::Entity.new(nil, nil, :sentence)
94
153
  end
154
+
155
+ # Create a new instance of {Greeb::Entity} with `:subsentence` type.
156
+ #
157
+ # @return [Greeb::Entity] a new entity instance.
158
+ #
159
+ def new_subsentence
160
+ Greeb::Entity.new(nil, nil, :subsentence)
161
+ end
95
162
  end
data/lib/greeb/version.rb CHANGED
@@ -5,5 +5,5 @@
5
5
  module Greeb
6
6
  # Version of Greeb.
7
7
  #
8
- VERSION = '0.1.0.rc3'
8
+ VERSION = '0.1.0.rc4'
9
9
  end
@@ -89,8 +89,10 @@ module Greeb
89
89
 
90
90
  subject { Segmentator.new(@tokenizer) }
91
91
 
92
+ let(:sentences) { subject.sentences }
93
+
92
94
  it 'should be extracted' do
93
- subject.extract(*subject.sentences).must_equal({
95
+ subject.extract(*sentences).must_equal({
94
96
  Entity.new(0, 6, :sentence) => [
95
97
  Entity.new(0, 5, :letter),
96
98
  Entity.new(5, 6, :punct)
@@ -108,5 +110,22 @@ module Greeb
108
110
  })
109
111
  end
110
112
  end
113
+
114
+ describe 'subsentence extractor' do
115
+ before { @tokenizer = Tokenizer.new('Hello, I am JC Denton.') }
116
+
117
+ subject { Segmentator.new(@tokenizer) }
118
+
119
+ let(:sentences) { subject.sentences }
120
+
121
+ it 'should extract subsentences' do
122
+ subject.subextract(*sentences).must_equal({
123
+ Entity.new(0, 22, :sentence) => [
124
+ Entity.new(0, 6, :subsentence),
125
+ Entity.new(7, 22, :subsentence)
126
+ ]
127
+ })
128
+ end
129
+ end
111
130
  end
112
131
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0.rc3
4
+ version: 0.1.0.rc4
5
5
  prerelease: 6
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-10 00:00:00.000000000 Z
12
+ date: 2012-07-20 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
@@ -112,7 +112,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
112
112
  version: '0'
113
113
  segments:
114
114
  - 0
115
- hash: -2527935574265859361
115
+ hash: 1130932854600612903
116
116
  required_rubygems_version: !ruby/object:Gem::Requirement
117
117
  none: false
118
118
  requirements: