greeb 0.2.0.pre3 → 0.2.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/greeb +1 -1
- data/lib/greeb/segmentator.rb +49 -75
- data/lib/greeb/tokenizer.rb +35 -10
- data/lib/greeb/version.rb +1 -1
- data/spec/segmentator_spec.rb +5 -2
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 344f017a6eb1990e716422ce643c41bcfd6a4ae7
|
4
|
+
data.tar.gz: 13fff3ec9d8cf11f153fe5bf33e882cae7fbd1ce
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bff843912bcafb5be0400ce1b68dba04689be58dcde657e9809e6a4ff50b9c226f058f08b4a9549842d7ea57787abda637d7f7d988ecc06a0d4b7fe99d5695cd
|
7
|
+
data.tar.gz: 90c16130b0428e81ea11d25d0d50e2d2786365c024ab859485e19abbaeeb7338e967ddf0226d7b3d5a3bc4e9797d6ed7e841e1c0f334c1872bf43f6e5d0b3973
|
data/bin/greeb
CHANGED
data/lib/greeb/segmentator.rb
CHANGED
@@ -7,7 +7,7 @@ class Greeb::Segmentator
|
|
7
7
|
# Sentence does not start from the separator character, line break
|
8
8
|
# character, and punctuation characters.
|
9
9
|
#
|
10
|
-
|
10
|
+
SENTENCE_DOES_NOT_START = [:separ, :break, :punct, :spunct]
|
11
11
|
|
12
12
|
attr_reader :tokens
|
13
13
|
|
@@ -24,8 +24,7 @@ class Greeb::Segmentator
|
|
24
24
|
# @return [Array<Greeb::Entity>] a set of sentences.
|
25
25
|
#
|
26
26
|
def sentences
|
27
|
-
|
28
|
-
@sentences
|
27
|
+
@sentences ||= detect_entities(new_sentence, [:punct])
|
29
28
|
end
|
30
29
|
|
31
30
|
# Subsentences memoization method.
|
@@ -33,8 +32,7 @@ class Greeb::Segmentator
|
|
33
32
|
# @return [Array<Greeb::Entity>] a set of subsentences.
|
34
33
|
#
|
35
34
|
def subsentences
|
36
|
-
|
37
|
-
@subsentences
|
35
|
+
@subsentences ||= detect_entities(new_subsentence, [:punct, :spunct])
|
38
36
|
end
|
39
37
|
|
40
38
|
# Extract tokens from the set of sentences.
|
@@ -44,99 +42,75 @@ class Greeb::Segmentator
|
|
44
42
|
# @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
|
45
43
|
# sentences as keys and tokens arrays as values.
|
46
44
|
#
|
47
|
-
def extract(sentences)
|
45
|
+
def extract(sentences, collection = tokens)
|
48
46
|
Hash[
|
49
47
|
sentences.map do |s|
|
50
|
-
[s,
|
51
|
-
end
|
52
|
-
]
|
53
|
-
end
|
54
|
-
|
55
|
-
# Extract subsentences from the set of sentences.
|
56
|
-
#
|
57
|
-
# @param sentences [Array<Greeb::Entity>] a list of sentences.
|
58
|
-
#
|
59
|
-
# @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
|
60
|
-
# sentences as keys and subsentences arrays as values.
|
61
|
-
#
|
62
|
-
def subextract(sentences)
|
63
|
-
Hash[
|
64
|
-
sentences.map do |s|
|
65
|
-
[s, subsentences.select { |ss| ss.from >= s.from and ss.to <= s.to }]
|
48
|
+
[s, collection.select { |t| t.from >= s.from and t.to <= s.to }]
|
66
49
|
end
|
67
50
|
]
|
68
51
|
end
|
69
52
|
|
70
53
|
protected
|
71
|
-
# Implementation of the
|
72
|
-
# changes the `@sentences` ivar.
|
54
|
+
# Implementation of the entity detection method.
|
73
55
|
#
|
74
|
-
# @
|
56
|
+
# @param sample [Greeb::Entity] a sample of entity to be cloned in the
|
57
|
+
# process.
|
58
|
+
# @param stop_marks [Array<Symbol>] an array that stores the
|
59
|
+
# corresponding stop marks of the necessary entities.
|
75
60
|
#
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
if !sentence.from and SENTENCE_DOESNT_START.include?(token.type)
|
81
|
-
next sentence
|
82
|
-
end
|
83
|
-
|
84
|
-
sentence.from = token.from unless sentence.from
|
85
|
-
|
86
|
-
next sentence if sentence.to and sentence.to > token.to
|
61
|
+
# @return [Array<Greeb::Entity>] a set of entities.
|
62
|
+
#
|
63
|
+
def detect_entities(sample, stop_marks)
|
64
|
+
collection = []
|
87
65
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
66
|
+
rest = tokens.inject(sample.dup) do |entity, token|
|
67
|
+
next entity if sentence_does_not_start? entity, token
|
68
|
+
entity.from = token.from unless entity.from
|
69
|
+
next entity if entity.to and entity.to > token.to
|
92
70
|
|
93
|
-
|
94
|
-
|
71
|
+
if stop_marks.include? token.type
|
72
|
+
entity.to = find_forward(tokens, token).to
|
73
|
+
collection << entity
|
74
|
+
entity = sample.dup
|
95
75
|
elsif :separ != token.type
|
96
|
-
|
76
|
+
entity.to = token.to
|
97
77
|
end
|
98
78
|
|
99
|
-
|
79
|
+
entity
|
100
80
|
end
|
101
81
|
|
102
|
-
|
82
|
+
if rest.from && rest.to
|
83
|
+
collection << rest
|
84
|
+
else
|
85
|
+
collection
|
86
|
+
end
|
103
87
|
end
|
104
88
|
|
105
|
-
|
106
|
-
#
|
89
|
+
private
|
90
|
+
# Check the possibility of starting a new sentence by the specified
|
91
|
+
# pair of entity and token.
|
107
92
|
#
|
108
|
-
# @
|
93
|
+
# @param entity [Greeb::Entity] an entity to be checked.
|
94
|
+
# @param token [Greeb::Entity] a token to be checked.
|
109
95
|
#
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
next subsentence
|
116
|
-
end
|
117
|
-
|
118
|
-
subsentence.from = token.from unless subsentence.from
|
119
|
-
|
120
|
-
next subsentence if subsentence.to && subsentence.to > token.to
|
121
|
-
|
122
|
-
if [:punct, :spunct].include? token.type
|
123
|
-
subsentence.to = tokens.
|
124
|
-
select { |t| t.from >= token.from }.
|
125
|
-
inject(token) { |r, t| break r if t.type != token.type; t }.to
|
126
|
-
|
127
|
-
@subsentences << subsentence
|
128
|
-
subsentence = new_subsentence
|
129
|
-
elsif :separ != token.type
|
130
|
-
subsentence.to = token.to
|
131
|
-
end
|
132
|
-
|
133
|
-
subsentence
|
134
|
-
end
|
96
|
+
# @return true or false.
|
97
|
+
#
|
98
|
+
def sentence_does_not_start?(entity, token)
|
99
|
+
!entity.from and SENTENCE_DOES_NOT_START.include? token.type
|
100
|
+
end
|
135
101
|
|
136
|
-
|
102
|
+
# Find the last token in the run of same-typed tokens starting at the sample.
|
103
|
+
#
|
104
|
+
# @param collection [Array<Greeb::Entity>] array of possible tokens.
|
105
|
+
# @param sample [Greeb::Entity] a token that is treated as a sample.
|
106
|
+
#
|
107
|
+
# @return [Greeb::Entity] a forwarding token.
|
108
|
+
#
|
109
|
+
def find_forward(collection, sample)
|
110
|
+
collection.select { |t| t.from >= sample.from }.
|
111
|
+
inject(sample) { |r, t| t.type == sample.type ? t : (break r) }
|
137
112
|
end
|
138
113
|
|
139
|
-
private
|
140
114
|
# Create a new instance of {Greeb::Entity} with `:sentence` type.
|
141
115
|
#
|
142
116
|
# @return [Greeb::Entity] a new entity instance.
|
data/lib/greeb/tokenizer.rb
CHANGED
@@ -49,14 +49,7 @@ module Greeb::Tokenizer
|
|
49
49
|
scanner = Greeb::StringScanner.new(text)
|
50
50
|
tokens = []
|
51
51
|
while !scanner.eos?
|
52
|
-
|
53
|
-
parse! scanner, tokens, FLOATS, :float or
|
54
|
-
parse! scanner, tokens, INTEGERS, :integer or
|
55
|
-
split_parse! scanner, tokens, SENTENCE_PUNCTUATIONS, :spunct or
|
56
|
-
split_parse! scanner, tokens, PUNCTUATIONS, :punct or
|
57
|
-
split_parse! scanner, tokens, SEPARATORS, :separ or
|
58
|
-
split_parse! scanner, tokens, BREAKS, :break or
|
59
|
-
parse! scanner, tokens, RESIDUALS, :residual or
|
52
|
+
step scanner, tokens or
|
60
53
|
raise Greeb::UnknownEntity.new(text, scanner.char_pos)
|
61
54
|
end
|
62
55
|
tokens
|
@@ -64,7 +57,25 @@ module Greeb::Tokenizer
|
|
64
57
|
scanner.terminate
|
65
58
|
end
|
66
59
|
|
67
|
-
|
60
|
+
protected
|
61
|
+
# One iteration of the tokenization process.
|
62
|
+
#
|
63
|
+
# @param scanner [Greeb::StringScanner] string scanner.
|
64
|
+
# @param tokens [Array<Greeb::Entity>] result array.
|
65
|
+
#
|
66
|
+
# @return [Array<Greeb::Entity>] the modified set of extracted tokens.
|
67
|
+
#
|
68
|
+
def step scanner, tokens
|
69
|
+
parse! scanner, tokens, LETTERS, :letter or
|
70
|
+
parse! scanner, tokens, FLOATS, :float or
|
71
|
+
parse! scanner, tokens, INTEGERS, :integer or
|
72
|
+
split_parse! scanner, tokens, SENTENCE_PUNCTUATIONS, :spunct or
|
73
|
+
split_parse! scanner, tokens, PUNCTUATIONS, :punct or
|
74
|
+
split_parse! scanner, tokens, SEPARATORS, :separ or
|
75
|
+
split_parse! scanner, tokens, BREAKS, :break or
|
76
|
+
parse! scanner, tokens, RESIDUALS, :residual
|
77
|
+
end
|
78
|
+
|
68
79
|
# Try to parse one small piece of text that is covered by pattern
|
69
80
|
# of necessary type.
|
70
81
|
#
|
@@ -99,9 +110,23 @@ module Greeb::Tokenizer
|
|
99
110
|
def split_parse! scanner, tokens, pattern, type
|
100
111
|
return false unless token = scanner.scan(pattern)
|
101
112
|
position = scanner.char_pos - token.length
|
102
|
-
token
|
113
|
+
split(token).inject(position) do |before, s|
|
103
114
|
tokens << Greeb::Entity.new(before, before + s.length, type)
|
104
115
|
before + s.length
|
105
116
|
end
|
106
117
|
end
|
118
|
+
|
119
|
+
# Split one line into characters array, but also combine line breaks
|
120
|
+
# into single elements.
|
121
|
+
#
|
122
|
+
# For instance, `"a b\n\n\nc"` would be transformed into the following
|
123
|
+
# array: `["a", " ", "b", "\n\n\n", "c"]`.
|
124
|
+
#
|
125
|
+
# @param token [String] a token to be split.
|
126
|
+
#
|
127
|
+
# @return [Array<String>] split characters.
|
128
|
+
#
|
129
|
+
def split(token)
|
130
|
+
token.scan(/((.|\n)\2*)/).map(&:first)
|
131
|
+
end
|
107
132
|
end
|
data/lib/greeb/version.rb
CHANGED
data/spec/segmentator_spec.rb
CHANGED
@@ -72,8 +72,9 @@ module Greeb
|
|
72
72
|
describe 'sentence extractor' do
|
73
73
|
let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
|
74
74
|
let(:segmentator) { Segmentator.new(tokens) }
|
75
|
+
let(:sentences) { segmentator.sentences }
|
75
76
|
|
76
|
-
subject { segmentator.extract(
|
77
|
+
subject { segmentator.extract(sentences) }
|
77
78
|
|
78
79
|
it 'should be extracted' do
|
79
80
|
subject.must_equal(
|
@@ -98,8 +99,10 @@ module Greeb
|
|
98
99
|
describe 'subsentence extractor' do
|
99
100
|
let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
|
100
101
|
let(:segmentator) { Segmentator.new(tokens) }
|
102
|
+
let(:sentences) { segmentator.sentences }
|
103
|
+
let(:subsentences) { segmentator.subsentences }
|
101
104
|
|
102
|
-
subject { segmentator.
|
105
|
+
subject { segmentator.extract(sentences, subsentences) }
|
103
106
|
|
104
107
|
it 'should extract subsentences' do
|
105
108
|
subject.must_equal(
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0.
|
4
|
+
version: 0.2.0.rc1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Ustalov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-05-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -88,7 +88,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
88
88
|
version: 1.3.1
|
89
89
|
requirements: []
|
90
90
|
rubyforge_project: greeb
|
91
|
-
rubygems_version: 2.0.
|
91
|
+
rubygems_version: 2.0.3
|
92
92
|
signing_key:
|
93
93
|
specification_version: 4
|
94
94
|
summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
|
@@ -99,4 +99,3 @@ test_files:
|
|
99
99
|
- spec/spec_helper.rb
|
100
100
|
- spec/support/invoker.rb
|
101
101
|
- spec/tokenizer_spec.rb
|
102
|
-
has_rdoc:
|