greeb 0.2.0.pre3 → 0.2.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 618591e00b61f1df11f98bdd045bd650d34ba863
4
- data.tar.gz: 88d1b8448e98c18e6d9759e4d992d2fbea7c1d63
3
+ metadata.gz: 344f017a6eb1990e716422ce643c41bcfd6a4ae7
4
+ data.tar.gz: 13fff3ec9d8cf11f153fe5bf33e882cae7fbd1ce
5
5
  SHA512:
6
- metadata.gz: e8113e47988e80aabfc07314268a5f8220cce88edbf06bd69b35602623c0a310c3c460e300143943596decae621ee69b4909371b9f43a7d9225bceb336bf21f6
7
- data.tar.gz: 7ebe3c3e0a603bf1fc0072376c3b2b544b43ae38e31e8bc5ff9e34fcaf362b8c474ba565db67363c09f10a2f4960fdb0bf7a165ee6c0b90d657b3914231cc07a
6
+ metadata.gz: bff843912bcafb5be0400ce1b68dba04689be58dcde657e9809e6a4ff50b9c226f058f08b4a9549842d7ea57787abda637d7f7d988ecc06a0d4b7fe99d5695cd
7
+ data.tar.gz: 90c16130b0428e81ea11d25d0d50e2d2786365c024ab859485e19abbaeeb7338e967ddf0226d7b3d5a3bc4e9797d6ed7e841e1c0f334c1872bf43f6e5d0b3973
data/bin/greeb CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  if File.exists? File.expand_path('../../.git', __FILE__)
4
- $:.unshift File.expand_path('../../lib', __FILE__)
4
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
5
5
  end
6
6
 
7
7
  require 'greeb'
@@ -7,7 +7,7 @@ class Greeb::Segmentator
7
7
  # Sentence does not start from the separator character, line break
8
8
  # character, and punctuation characters.
9
9
  #
10
- SENTENCE_DOESNT_START = [:separ, :break, :punct, :spunct]
10
+ SENTENCE_DOES_NOT_START = [:separ, :break, :punct, :spunct]
11
11
 
12
12
  attr_reader :tokens
13
13
 
@@ -24,8 +24,7 @@ class Greeb::Segmentator
24
24
  # @return [Array<Greeb::Entity>] a set of sentences.
25
25
  #
26
26
  def sentences
27
- detect_sentences! unless @sentences
28
- @sentences
27
+ @sentences ||= detect_entities(new_sentence, [:punct])
29
28
  end
30
29
 
31
30
  # Subsentences memoization method.
@@ -33,8 +32,7 @@ class Greeb::Segmentator
33
32
  # @return [Array<Greeb::Entity>] a set of subsentences.
34
33
  #
35
34
  def subsentences
36
- detect_subsentences! unless @subsentences
37
- @subsentences
35
+ @subsentences ||= detect_entities(new_subsentence, [:punct, :spunct])
38
36
  end
39
37
 
40
38
  # Extract tokens from the set of sentences.
@@ -44,99 +42,75 @@ class Greeb::Segmentator
44
42
  # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
45
43
  # sentences as keys and tokens arrays as values.
46
44
  #
47
- def extract(sentences)
45
+ def extract(sentences, collection = tokens)
48
46
  Hash[
49
47
  sentences.map do |s|
50
- [s, tokens.select { |t| t.from >= s.from and t.to <= s.to }]
51
- end
52
- ]
53
- end
54
-
55
- # Extract subsentences from the set of sentences.
56
- #
57
- # @param sentences [Array<Greeb::Entity>] a list of sentences.
58
- #
59
- # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
60
- # sentences as keys and subsentences arrays as values.
61
- #
62
- def subextract(sentences)
63
- Hash[
64
- sentences.map do |s|
65
- [s, subsentences.select { |ss| ss.from >= s.from and ss.to <= s.to }]
48
+ [s, collection.select { |t| t.from >= s.from and t.to <= s.to }]
66
49
  end
67
50
  ]
68
51
  end
69
52
 
70
53
  protected
71
- # Implementation of the sentence detection method. This method
72
- # changes the `@sentences` ivar.
54
+ # Implementation of the entity detection method.
73
55
  #
74
- # @return [nil] nothing.
56
+ # @param sample [Greeb::Entity] a sample of entity to be cloned in the
57
+ # process.
58
+ # @param stop_marks [Array<Symbol>] an array that stores the
59
+ # correspondent stop marks of the necessary entities.
75
60
  #
76
- def detect_sentences!
77
- @sentences = []
78
-
79
- rest = tokens.inject(new_sentence) do |sentence, token|
80
- if !sentence.from and SENTENCE_DOESNT_START.include?(token.type)
81
- next sentence
82
- end
83
-
84
- sentence.from = token.from unless sentence.from
85
-
86
- next sentence if sentence.to and sentence.to > token.to
61
+ # @return [Array<Greeb::Entity>] a set of entities.
62
+ #
63
+ def detect_entities(sample, stop_marks)
64
+ collection = []
87
65
 
88
- if :punct == token.type
89
- sentence.to = tokens.
90
- select { |t| t.from >= token.from }.
91
- inject(token) { |r, t| break r if t.type != token.type; t }.to
66
+ rest = tokens.inject(sample.dup) do |entity, token|
67
+ next entity if sentence_does_not_start? entity, token
68
+ entity.from = token.from unless entity.from
69
+ next entity if entity.to and entity.to > token.to
92
70
 
93
- @sentences << sentence
94
- sentence = new_sentence
71
+ if stop_marks.include? token.type
72
+ entity.to = find_forward(tokens, token).to
73
+ collection << entity
74
+ entity = sample.dup
95
75
  elsif :separ != token.type
96
- sentence.to = token.to
76
+ entity.to = token.to
97
77
  end
98
78
 
99
- sentence
79
+ entity
100
80
  end
101
81
 
102
- nil.tap { @sentences << rest if rest.from && rest.to }
82
+ if rest.from && rest.to
83
+ collection << rest
84
+ else
85
+ collection
86
+ end
103
87
  end
104
88
 
105
- # Implementation of the subsentence detection method. This method
106
- # changes the `@subsentences` ivar.
89
+ private
90
+ # Check the possibility of starting a new sentence by the specified
91
+ # pair of entity and token.
107
92
  #
108
- # @return [nil] nothing.
93
+ # @param entity [Greeb::Entity] an entity to be checked.
94
+ # @param token [Greeb::Entity] a token to be checked.
109
95
  #
110
- def detect_subsentences!
111
- @subsentences = SortedSet.new
112
-
113
- rest = tokens.inject(new_subsentence) do |subsentence, token|
114
- if !subsentence.from && SENTENCE_DOESNT_START.include?(token.type)
115
- next subsentence
116
- end
117
-
118
- subsentence.from = token.from unless subsentence.from
119
-
120
- next subsentence if subsentence.to && subsentence.to > token.to
121
-
122
- if [:punct, :spunct].include? token.type
123
- subsentence.to = tokens.
124
- select { |t| t.from >= token.from }.
125
- inject(token) { |r, t| break r if t.type != token.type; t }.to
126
-
127
- @subsentences << subsentence
128
- subsentence = new_subsentence
129
- elsif :separ != token.type
130
- subsentence.to = token.to
131
- end
132
-
133
- subsentence
134
- end
96
+ # @return true or false.
97
+ #
98
+ def sentence_does_not_start?(entity, token)
99
+ !entity.from and SENTENCE_DOES_NOT_START.include? token.type
100
+ end
135
101
 
136
- nil.tap { @subsentences << rest if rest.from && rest.to }
102
+ # Find a forwarding token that has another type.
103
+ #
104
+ # @param collection [Array<Greeb::Entity>] array of possible tokens.
105
+ # @param sample [Greeb::Entity] a token that is treated as a sample.
106
+ #
107
+ # @return [Greeb::Entity] a forwarding token.
108
+ #
109
+ def find_forward(collection, sample)
110
+ collection.select { |t| t.from >= sample.from }.
111
+ inject(sample) { |r, t| t.type == sample.type ? t : (break r) }
137
112
  end
138
113
 
139
- private
140
114
  # Create a new instance of {Greeb::Entity} with `:sentence` type.
141
115
  #
142
116
  # @return [Greeb::Entity] a new entity instance.
@@ -49,14 +49,7 @@ module Greeb::Tokenizer
49
49
  scanner = Greeb::StringScanner.new(text)
50
50
  tokens = []
51
51
  while !scanner.eos?
52
- parse! scanner, tokens, LETTERS, :letter or
53
- parse! scanner, tokens, FLOATS, :float or
54
- parse! scanner, tokens, INTEGERS, :integer or
55
- split_parse! scanner, tokens, SENTENCE_PUNCTUATIONS, :spunct or
56
- split_parse! scanner, tokens, PUNCTUATIONS, :punct or
57
- split_parse! scanner, tokens, SEPARATORS, :separ or
58
- split_parse! scanner, tokens, BREAKS, :break or
59
- parse! scanner, tokens, RESIDUALS, :residual or
52
+ step scanner, tokens or
60
53
  raise Greeb::UnknownEntity.new(text, scanner.char_pos)
61
54
  end
62
55
  tokens
@@ -64,7 +57,25 @@ module Greeb::Tokenizer
64
57
  scanner.terminate
65
58
  end
66
59
 
67
- private
60
+ protected
61
+ # One iteration of the tokenization process.
62
+ #
63
+ # @param scanner [Greeb::StringScanner] string scanner.
64
+ # @param tokens [Array<Greeb::Entity>] result array.
65
+ #
66
+ # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
67
+ #
68
+ def step scanner, tokens
69
+ parse! scanner, tokens, LETTERS, :letter or
70
+ parse! scanner, tokens, FLOATS, :float or
71
+ parse! scanner, tokens, INTEGERS, :integer or
72
+ split_parse! scanner, tokens, SENTENCE_PUNCTUATIONS, :spunct or
73
+ split_parse! scanner, tokens, PUNCTUATIONS, :punct or
74
+ split_parse! scanner, tokens, SEPARATORS, :separ or
75
+ split_parse! scanner, tokens, BREAKS, :break or
76
+ parse! scanner, tokens, RESIDUALS, :residual
77
+ end
78
+
68
79
  # Try to parse one small piece of text that is covered by pattern
69
80
  # of necessary type.
70
81
  #
@@ -99,9 +110,23 @@ module Greeb::Tokenizer
99
110
  def split_parse! scanner, tokens, pattern, type
100
111
  return false unless token = scanner.scan(pattern)
101
112
  position = scanner.char_pos - token.length
102
- token.scan(/((.|\n)\2*)/).map(&:first).inject(position) do |before, s|
113
+ split(token).inject(position) do |before, s|
103
114
  tokens << Greeb::Entity.new(before, before + s.length, type)
104
115
  before + s.length
105
116
  end
106
117
  end
118
+
119
+ # Split one line into characters array, but also combine line breaks
120
+ # into single elements.
121
+ #
122
+ # For instance, `"a b\n\n\nc"` would be transformed into the following
123
+ # array: `["a", " ", "b", "\n\n\n", "c"]`.
124
+ #
125
+ # @param token [String] a token to be split.
126
+ #
127
+ # @return [Array<String>] split characters.
128
+ #
129
+ def split(token)
130
+ token.scan(/((.|\n)\2*)/).map(&:first)
131
+ end
107
132
  end
data/lib/greeb/version.rb CHANGED
@@ -5,5 +5,5 @@
5
5
  module Greeb
6
6
  # Version of Greeb.
7
7
  #
8
- VERSION = '0.2.0.pre3'
8
+ VERSION = '0.2.0.rc1'
9
9
  end
@@ -72,8 +72,9 @@ module Greeb
72
72
  describe 'sentence extractor' do
73
73
  let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
74
74
  let(:segmentator) { Segmentator.new(tokens) }
75
+ let(:sentences) { segmentator.sentences }
75
76
 
76
- subject { segmentator.extract(segmentator.sentences) }
77
+ subject { segmentator.extract(sentences) }
77
78
 
78
79
  it 'should be extracted' do
79
80
  subject.must_equal(
@@ -98,8 +99,10 @@ module Greeb
98
99
  describe 'subsentence extractor' do
99
100
  let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
100
101
  let(:segmentator) { Segmentator.new(tokens) }
102
+ let(:sentences) { segmentator.sentences }
103
+ let(:subsentences) { segmentator.subsentences }
101
104
 
102
- subject { segmentator.subextract(segmentator.sentences) }
105
+ subject { segmentator.extract(sentences, subsentences) }
103
106
 
104
107
  it 'should extract subsentences' do
105
108
  subject.must_equal(
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0.pre3
4
+ version: 0.2.0.rc1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Ustalov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-04-30 00:00:00.000000000 Z
11
+ date: 2013-05-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -88,7 +88,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
88
88
  version: 1.3.1
89
89
  requirements: []
90
90
  rubyforge_project: greeb
91
- rubygems_version: 2.0.0
91
+ rubygems_version: 2.0.3
92
92
  signing_key:
93
93
  specification_version: 4
94
94
  summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
@@ -99,4 +99,3 @@ test_files:
99
99
  - spec/spec_helper.rb
100
100
  - spec/support/invoker.rb
101
101
  - spec/tokenizer_spec.rb
102
- has_rdoc: