greeb 0.2.0.pre3 → 0.2.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 618591e00b61f1df11f98bdd045bd650d34ba863
4
- data.tar.gz: 88d1b8448e98c18e6d9759e4d992d2fbea7c1d63
3
+ metadata.gz: 344f017a6eb1990e716422ce643c41bcfd6a4ae7
4
+ data.tar.gz: 13fff3ec9d8cf11f153fe5bf33e882cae7fbd1ce
5
5
  SHA512:
6
- metadata.gz: e8113e47988e80aabfc07314268a5f8220cce88edbf06bd69b35602623c0a310c3c460e300143943596decae621ee69b4909371b9f43a7d9225bceb336bf21f6
7
- data.tar.gz: 7ebe3c3e0a603bf1fc0072376c3b2b544b43ae38e31e8bc5ff9e34fcaf362b8c474ba565db67363c09f10a2f4960fdb0bf7a165ee6c0b90d657b3914231cc07a
6
+ metadata.gz: bff843912bcafb5be0400ce1b68dba04689be58dcde657e9809e6a4ff50b9c226f058f08b4a9549842d7ea57787abda637d7f7d988ecc06a0d4b7fe99d5695cd
7
+ data.tar.gz: 90c16130b0428e81ea11d25d0d50e2d2786365c024ab859485e19abbaeeb7338e967ddf0226d7b3d5a3bc4e9797d6ed7e841e1c0f334c1872bf43f6e5d0b3973
data/bin/greeb CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  if File.exists? File.expand_path('../../.git', __FILE__)
4
- $:.unshift File.expand_path('../../lib', __FILE__)
4
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
5
5
  end
6
6
 
7
7
  require 'greeb'
@@ -7,7 +7,7 @@ class Greeb::Segmentator
7
7
  # Sentence does not start from the separator character, line break
8
8
  # character, and punctuation characters.
9
9
  #
10
- SENTENCE_DOESNT_START = [:separ, :break, :punct, :spunct]
10
+ SENTENCE_DOES_NOT_START = [:separ, :break, :punct, :spunct]
11
11
 
12
12
  attr_reader :tokens
13
13
 
@@ -24,8 +24,7 @@ class Greeb::Segmentator
24
24
  # @return [Array<Greeb::Entity>] a set of sentences.
25
25
  #
26
26
  def sentences
27
- detect_sentences! unless @sentences
28
- @sentences
27
+ @sentences ||= detect_entities(new_sentence, [:punct])
29
28
  end
30
29
 
31
30
  # Subsentences memoization method.
@@ -33,8 +32,7 @@ class Greeb::Segmentator
33
32
  # @return [Array<Greeb::Entity>] a set of subsentences.
34
33
  #
35
34
  def subsentences
36
- detect_subsentences! unless @subsentences
37
- @subsentences
35
+ @subsentences ||= detect_entities(new_subsentence, [:punct, :spunct])
38
36
  end
39
37
 
40
38
  # Extract tokens from the set of sentences.
@@ -44,99 +42,75 @@ class Greeb::Segmentator
44
42
  # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
45
43
  # sentences as keys and tokens arrays as values.
46
44
  #
47
- def extract(sentences)
45
+ def extract(sentences, collection = tokens)
48
46
  Hash[
49
47
  sentences.map do |s|
50
- [s, tokens.select { |t| t.from >= s.from and t.to <= s.to }]
51
- end
52
- ]
53
- end
54
-
55
- # Extract subsentences from the set of sentences.
56
- #
57
- # @param sentences [Array<Greeb::Entity>] a list of sentences.
58
- #
59
- # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
60
- # sentences as keys and subsentences arrays as values.
61
- #
62
- def subextract(sentences)
63
- Hash[
64
- sentences.map do |s|
65
- [s, subsentences.select { |ss| ss.from >= s.from and ss.to <= s.to }]
48
+ [s, collection.select { |t| t.from >= s.from and t.to <= s.to }]
66
49
  end
67
50
  ]
68
51
  end
69
52
 
70
53
  protected
71
- # Implementation of the sentence detection method. This method
72
- # changes the `@sentences` ivar.
54
+ # Implementation of the entity detection method.
73
55
  #
74
- # @return [nil] nothing.
56
+ # @param sample [Greeb::Entity] a sample of entity to be cloned in the
57
+ # process.
58
+ # @param stop_marks [Array<Symbol>] an array that stores the
59
+ # correspondent stop marks of the necessary entities.
75
60
  #
76
- def detect_sentences!
77
- @sentences = []
78
-
79
- rest = tokens.inject(new_sentence) do |sentence, token|
80
- if !sentence.from and SENTENCE_DOESNT_START.include?(token.type)
81
- next sentence
82
- end
83
-
84
- sentence.from = token.from unless sentence.from
85
-
86
- next sentence if sentence.to and sentence.to > token.to
61
+ # @return [Array<Greeb::Entity>] a set of entities.
62
+ #
63
+ def detect_entities(sample, stop_marks)
64
+ collection = []
87
65
 
88
- if :punct == token.type
89
- sentence.to = tokens.
90
- select { |t| t.from >= token.from }.
91
- inject(token) { |r, t| break r if t.type != token.type; t }.to
66
+ rest = tokens.inject(sample.dup) do |entity, token|
67
+ next entity if sentence_does_not_start? entity, token
68
+ entity.from = token.from unless entity.from
69
+ next entity if entity.to and entity.to > token.to
92
70
 
93
- @sentences << sentence
94
- sentence = new_sentence
71
+ if stop_marks.include? token.type
72
+ entity.to = find_forward(tokens, token).to
73
+ collection << entity
74
+ entity = sample.dup
95
75
  elsif :separ != token.type
96
- sentence.to = token.to
76
+ entity.to = token.to
97
77
  end
98
78
 
99
- sentence
79
+ entity
100
80
  end
101
81
 
102
- nil.tap { @sentences << rest if rest.from && rest.to }
82
+ if rest.from && rest.to
83
+ collection << rest
84
+ else
85
+ collection
86
+ end
103
87
  end
104
88
 
105
- # Implementation of the subsentence detection method. This method
106
- # changes the `@subsentences` ivar.
89
+ private
90
+ # Check the possibility of starting a new sentence by the specified
91
+ # pair of entity and token.
107
92
  #
108
- # @return [nil] nothing.
93
+ # @param entity [Greeb::Entity] an entity to be checked.
94
+ # @param token [Greeb::Entity] a token to be checked.
109
95
  #
110
- def detect_subsentences!
111
- @subsentences = SortedSet.new
112
-
113
- rest = tokens.inject(new_subsentence) do |subsentence, token|
114
- if !subsentence.from && SENTENCE_DOESNT_START.include?(token.type)
115
- next subsentence
116
- end
117
-
118
- subsentence.from = token.from unless subsentence.from
119
-
120
- next subsentence if subsentence.to && subsentence.to > token.to
121
-
122
- if [:punct, :spunct].include? token.type
123
- subsentence.to = tokens.
124
- select { |t| t.from >= token.from }.
125
- inject(token) { |r, t| break r if t.type != token.type; t }.to
126
-
127
- @subsentences << subsentence
128
- subsentence = new_subsentence
129
- elsif :separ != token.type
130
- subsentence.to = token.to
131
- end
132
-
133
- subsentence
134
- end
96
+ # @return true or false.
97
+ #
98
+ def sentence_does_not_start?(entity, token)
99
+ !entity.from and SENTENCE_DOES_NOT_START.include? token.type
100
+ end
135
101
 
136
- nil.tap { @subsentences << rest if rest.from && rest.to }
102
+ # Find the last consecutive token of the same type, scanning forward.
103
+ #
104
+ # @param collection [Array<Greeb::Entity>] array of possible tokens.
105
+ # @param sample [Greeb::Entity] a token that is treated as a sample.
106
+ #
107
+ # @return [Greeb::Entity] a forwarding token.
108
+ #
109
+ def find_forward(collection, sample)
110
+ collection.select { |t| t.from >= sample.from }.
111
+ inject(sample) { |r, t| t.type == sample.type ? t : (break r) }
137
112
  end
138
113
 
139
- private
140
114
  # Create a new instance of {Greeb::Entity} with `:sentence` type.
141
115
  #
142
116
  # @return [Greeb::Entity] a new entity instance.
@@ -49,14 +49,7 @@ module Greeb::Tokenizer
49
49
  scanner = Greeb::StringScanner.new(text)
50
50
  tokens = []
51
51
  while !scanner.eos?
52
- parse! scanner, tokens, LETTERS, :letter or
53
- parse! scanner, tokens, FLOATS, :float or
54
- parse! scanner, tokens, INTEGERS, :integer or
55
- split_parse! scanner, tokens, SENTENCE_PUNCTUATIONS, :spunct or
56
- split_parse! scanner, tokens, PUNCTUATIONS, :punct or
57
- split_parse! scanner, tokens, SEPARATORS, :separ or
58
- split_parse! scanner, tokens, BREAKS, :break or
59
- parse! scanner, tokens, RESIDUALS, :residual or
52
+ step scanner, tokens or
60
53
  raise Greeb::UnknownEntity.new(text, scanner.char_pos)
61
54
  end
62
55
  tokens
@@ -64,7 +57,25 @@ module Greeb::Tokenizer
64
57
  scanner.terminate
65
58
  end
66
59
 
67
- private
60
+ protected
61
+ # One iteration of the tokenization process.
62
+ #
63
+ # @param scanner [Greeb::StringScanner] string scanner.
64
+ # @param tokens [Array<Greeb::Entity>] result array.
65
+ #
66
+ # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
67
+ #
68
+ def step scanner, tokens
69
+ parse! scanner, tokens, LETTERS, :letter or
70
+ parse! scanner, tokens, FLOATS, :float or
71
+ parse! scanner, tokens, INTEGERS, :integer or
72
+ split_parse! scanner, tokens, SENTENCE_PUNCTUATIONS, :spunct or
73
+ split_parse! scanner, tokens, PUNCTUATIONS, :punct or
74
+ split_parse! scanner, tokens, SEPARATORS, :separ or
75
+ split_parse! scanner, tokens, BREAKS, :break or
76
+ parse! scanner, tokens, RESIDUALS, :residual
77
+ end
78
+
68
79
  # Try to parse one small piece of text that is covered by pattern
69
80
  # of necessary type.
70
81
  #
@@ -99,9 +110,23 @@ module Greeb::Tokenizer
99
110
  def split_parse! scanner, tokens, pattern, type
100
111
  return false unless token = scanner.scan(pattern)
101
112
  position = scanner.char_pos - token.length
102
- token.scan(/((.|\n)\2*)/).map(&:first).inject(position) do |before, s|
113
+ split(token).inject(position) do |before, s|
103
114
  tokens << Greeb::Entity.new(before, before + s.length, type)
104
115
  before + s.length
105
116
  end
106
117
  end
118
+
119
+ # Split one line into an array of characters, but also combine line breaks
120
+ # into single elements.
121
+ #
122
+ # For instance, `"a b\n\n\nc"` would be transformed into the following
123
+ # array: `["a", " ", "b", "\n\n\n", "c"]`.
124
+ #
125
+ # @param token [String] a token to be split.
126
+ #
127
+ # @return [Array<String>] split characters.
128
+ #
129
+ def split(token)
130
+ token.scan(/((.|\n)\2*)/).map(&:first)
131
+ end
107
132
  end
data/lib/greeb/version.rb CHANGED
@@ -5,5 +5,5 @@
5
5
  module Greeb
6
6
  # Version of Greeb.
7
7
  #
8
- VERSION = '0.2.0.pre3'
8
+ VERSION = '0.2.0.rc1'
9
9
  end
@@ -72,8 +72,9 @@ module Greeb
72
72
  describe 'sentence extractor' do
73
73
  let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
74
74
  let(:segmentator) { Segmentator.new(tokens) }
75
+ let(:sentences) { segmentator.sentences }
75
76
 
76
- subject { segmentator.extract(segmentator.sentences) }
77
+ subject { segmentator.extract(sentences) }
77
78
 
78
79
  it 'should be extracted' do
79
80
  subject.must_equal(
@@ -98,8 +99,10 @@ module Greeb
98
99
  describe 'subsentence extractor' do
99
100
  let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
100
101
  let(:segmentator) { Segmentator.new(tokens) }
102
+ let(:sentences) { segmentator.sentences }
103
+ let(:subsentences) { segmentator.subsentences }
101
104
 
102
- subject { segmentator.subextract(segmentator.sentences) }
105
+ subject { segmentator.extract(sentences, subsentences) }
103
106
 
104
107
  it 'should extract subsentences' do
105
108
  subject.must_equal(
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0.pre3
4
+ version: 0.2.0.rc1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Ustalov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-04-30 00:00:00.000000000 Z
11
+ date: 2013-05-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -88,7 +88,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
88
88
  version: 1.3.1
89
89
  requirements: []
90
90
  rubyforge_project: greeb
91
- rubygems_version: 2.0.0
91
+ rubygems_version: 2.0.3
92
92
  signing_key:
93
93
  specification_version: 4
94
94
  summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
@@ -99,4 +99,3 @@ test_files:
99
99
  - spec/spec_helper.rb
100
100
  - spec/support/invoker.rb
101
101
  - spec/tokenizer_spec.rb
102
- has_rdoc: