greeb 0.2.0.pre3 → 0.2.0.rc1
- checksums.yaml +4 -4
- data/bin/greeb +1 -1
- data/lib/greeb/segmentator.rb +49 -75
- data/lib/greeb/tokenizer.rb +35 -10
- data/lib/greeb/version.rb +1 -1
- data/spec/segmentator_spec.rb +5 -2
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 344f017a6eb1990e716422ce643c41bcfd6a4ae7
+  data.tar.gz: 13fff3ec9d8cf11f153fe5bf33e882cae7fbd1ce
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bff843912bcafb5be0400ce1b68dba04689be58dcde657e9809e6a4ff50b9c226f058f08b4a9549842d7ea57787abda637d7f7d988ecc06a0d4b7fe99d5695cd
+  data.tar.gz: 90c16130b0428e81ea11d25d0d50e2d2786365c024ab859485e19abbaeeb7338e967ddf0226d7b3d5a3bc4e9797d6ed7e841e1c0f334c1872bf43f6e5d0b3973
data/bin/greeb
CHANGED
data/lib/greeb/segmentator.rb
CHANGED
@@ -7,7 +7,7 @@ class Greeb::Segmentator
   # Sentence does not start from the separator charater, line break
   # character, and punctuation characters.
   #
-  SENTENCE_DOESNT_START = [:separ, :break, :punct, :spunct]
+  SENTENCE_DOES_NOT_START = [:separ, :break, :punct, :spunct]
 
   attr_reader :tokens
 
@@ -24,8 +24,7 @@ class Greeb::Segmentator
   # @return [Array<Greeb::Entity>] a set of sentences.
   #
   def sentences
-    …
-    @sentences
+    @sentences ||= detect_entities(new_sentence, [:punct])
   end
 
   # Subsentences memoization method.
@@ -33,8 +32,7 @@ class Greeb::Segmentator
   # @return [Array<Greeb::Entity>] a set of subsentences.
   #
   def subsentences
-    …
-    @subsentences
+    @subsentences ||= detect_entities(new_subsentence, [:punct, :spunct])
   end
 
   # Extract tokens from the set of sentences.
@@ -44,99 +42,75 @@ class Greeb::Segmentator
   # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
   #   sentences as keys and tokens arrays as values.
   #
-  def extract(sentences)
+  def extract(sentences, collection = tokens)
     Hash[
       sentences.map do |s|
-        [s,
-      end
-    ]
-  end
-
-  # Extract subsentences from the set of sentences.
-  #
-  # @param sentences [Array<Greeb::Entity>] a list of sentences.
-  #
-  # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
-  #   sentences as keys and subsentences arrays as values.
-  #
-  def subextract(sentences)
-    Hash[
-      sentences.map do |s|
-        [s, subsentences.select { |ss| ss.from >= s.from and ss.to <= s.to }]
+        [s, collection.select { |t| t.from >= s.from and t.to <= s.to }]
       end
     ]
   end
 
   protected
-  # Implementation of the
-  # changes the `@sentences` ivar.
+  # Implementation of the entity detection method.
   #
-  # @
+  # @param sample [Greeb::Entity] a sample of entity to be cloned in the
+  #   process.
+  # @param stop_marks [Array<Symbol>] an array that stores the
+  #   correspondent stop marks of the necessary entities.
   #
-  …
-  …
-  …
-  …
-      if !sentence.from and SENTENCE_DOESNT_START.include?(token.type)
-        next sentence
-      end
-  …
-      sentence.from = token.from unless sentence.from
-  …
-      next sentence if sentence.to and sentence.to > token.to
+  # @return [Array<Greeb::Entity>] a set of entites.
+  #
+  def detect_entities(sample, stop_marks)
+    collection = []
 
-  …
-  …
-  …
-  …
+    rest = tokens.inject(sample.dup) do |entity, token|
+      next entity if sentence_does_not_start? entity, token
+      entity.from = token.from unless entity.from
+      next entity if entity.to and entity.to > token.to
 
-  …
-  …
+      if stop_marks.include? token.type
+        entity.to = find_forward(tokens, token).to
+        collection << entity
+        entity = sample.dup
       elsif :separ != token.type
-  …
+        entity.to = token.to
       end
 
-  …
+      entity
     end
 
-  …
+    if rest.from && rest.to
+      collection << rest
+    else
+      collection
+    end
   end
 
-  …
-  #
+  private
+  # Check the possibility of starting a new sentence by the specified
+  # pair of entity and token.
   #
-  # @
+  # @param entity [Greeb::Entity] an entity to be checked.
+  # @param token [Greeb::Entity] an token to be checked.
   #
-  …
-  …
-  …
-  …
-  …
-        next subsentence
-      end
-  …
-      subsentence.from = token.from unless subsentence.from
-  …
-      next subsentence if subsentence.to && subsentence.to > token.to
-  …
-      if [:punct, :spunct].include? token.type
-        subsentence.to = tokens.
-          select { |t| t.from >= token.from }.
-          inject(token) { |r, t| break r if t.type != token.type; t }.to
-  …
-        @subsentences << subsentence
-        subsentence = new_subsentence
-      elsif :separ != token.type
-        subsentence.to = token.to
-      end
-  …
-      subsentence
-    end
+  # @return true or false.
+  #
+  def sentence_does_not_start?(entity, token)
+    !entity.from and SENTENCE_DOES_NOT_START.include? token.type
+  end
 
-  …
+  # Find a forwarding token that has another type.
+  #
+  # @param collection [Array<Greeb::Entity>] array of possible tokens.
+  # @param sample [Greeb::Entity] a token that is treated as a sample.
+  #
+  # @return [Greeb::Entity] a forwarding token.
+  #
+  def find_forward(collection, sample)
+    collection.select { |t| t.from >= sample.from }.
+      inject(sample) { |r, t| t.type == sample.type ? t : (break r) }
   end
 
-  private
   # Create a new instance of {Greeb::Entity} with `:sentence` type.
   #
   # @return [Greeb::Entity] a new entity instance.
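Taken together, the segmentator changes collapse two near-identical detection passes (and the separate subextract method) into one generic, memoized walk: sentences and subsentences now differ only in the sample entity and the stop-mark list they hand to detect_entities, and extract covers both use cases through its optional collection argument. A minimal usage sketch of the new API; the fixture string mirrors the gem's own specs, while the comments describe expected behaviour rather than captured output:

    require 'greeb'

    tokens      = Greeb::Tokenizer.tokenize('Hello! I am JC Denton.')
    segmentator = Greeb::Segmentator.new(tokens)

    sentences = segmentator.sentences  # detected once, memoized via ||=
    segmentator.extract(sentences)     # sentence => its tokens (default collection)
    segmentator.extract(sentences, segmentator.subsentences)
    # sentence => its subsentences, replacing the removed subextract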
data/lib/greeb/tokenizer.rb
CHANGED
@@ -49,14 +49,7 @@ module Greeb::Tokenizer
     scanner = Greeb::StringScanner.new(text)
     tokens = []
     while !scanner.eos?
-      parse! scanner, tokens, LETTERS, :letter or
-      parse! scanner, tokens, FLOATS, :float or
-      parse! scanner, tokens, INTEGERS, :integer or
-      split_parse! scanner, tokens, SENTENCE_PUNCTUATIONS, :spunct or
-      split_parse! scanner, tokens, PUNCTUATIONS, :punct or
-      split_parse! scanner, tokens, SEPARATORS, :separ or
-      split_parse! scanner, tokens, BREAKS, :break or
-      parse! scanner, tokens, RESIDUALS, :residual or
+      step scanner, tokens or
         raise Greeb::UnknownEntity.new(text, scanner.char_pos)
     end
     tokens
@@ -64,7 +57,25 @@ module Greeb::Tokenizer
     scanner.terminate
   end
 
-  …
+  protected
+  # One iteration of the tokenization process.
+  #
+  # @param scanner [Greeb::StringScanner] string scanner.
+  # @param tokens [Array<Greeb::Entity>] result array.
+  #
+  # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
+  #
+  def step scanner, tokens
+    parse! scanner, tokens, LETTERS, :letter or
+    parse! scanner, tokens, FLOATS, :float or
+    parse! scanner, tokens, INTEGERS, :integer or
+    split_parse! scanner, tokens, SENTENCE_PUNCTUATIONS, :spunct or
+    split_parse! scanner, tokens, PUNCTUATIONS, :punct or
+    split_parse! scanner, tokens, SEPARATORS, :separ or
+    split_parse! scanner, tokens, BREAKS, :break or
+    parse! scanner, tokens, RESIDUALS, :residual
+  end
+
   # Try to parse one small piece of text that is covered by pattern
   # of necessary type.
   #
@@ -99,9 +110,23 @@ module Greeb::Tokenizer
   def split_parse! scanner, tokens, pattern, type
     return false unless token = scanner.scan(pattern)
     position = scanner.char_pos - token.length
-    token
+    split(token).inject(position) do |before, s|
       tokens << Greeb::Entity.new(before, before + s.length, type)
       before + s.length
     end
   end
+
+  # Split one line into characters array, but also combine line breaks
+  # into single elements.
+  #
+  # For instance, `"a b\n\n\nc"` would be transformed into the following
+  # array: `["a", " ", "b", "\n\n\n", "c"]`.
+  #
+  # @param token [String] a token to be splitted.
+  #
+  # @return [Array<String>] splitted characters.
+  #
+  def split(token)
+    token.scan(/((.|\n)\2*)/).map(&:first)
+  end
 end
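The new split helper is what lets split_parse! emit one entity per character while keeping runs of the same character, notably multi-line breaks, as a single element: the back-reference \2* extends each match with repetitions of whatever character group 2 captured, and (.|\n) is needed because . does not match a newline by default. A standalone sanity check of that regexp, runnable without the gem (the method body is copied from the diff above; the first test string comes from its doc comment, the second is an assumed punctuation run):

    def split(token)
      token.scan(/((.|\n)\2*)/).map(&:first)
    end

    p split("a b\n\n\nc")  # => ["a", " ", "b", "\n\n\n", "c"]
    p split("!!?")         # => ["!!", "?"] (identical neighbours are grouped)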
data/lib/greeb/version.rb
CHANGED
data/spec/segmentator_spec.rb
CHANGED
@@ -72,8 +72,9 @@ module Greeb
   describe 'sentence extractor' do
     let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
     let(:segmentator) { Segmentator.new(tokens) }
+    let(:sentences) { segmentator.sentences }
 
-    subject { segmentator.extract(
+    subject { segmentator.extract(sentences) }
 
     it 'should be extracted' do
       subject.must_equal(
@@ -98,8 +99,10 @@ module Greeb
   describe 'subsentence extractor' do
     let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
     let(:segmentator) { Segmentator.new(tokens) }
+    let(:sentences) { segmentator.sentences }
+    let(:subsentences) { segmentator.subsentences }
 
-    subject { segmentator.
+    subject { segmentator.extract(sentences, subsentences) }
 
     it 'should extract subsentences' do
       subject.must_equal(
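The two fixtures differ by one character on purpose: the ! in 'Hello! I am JC Denton.' closes a sentence, while the , in 'Hello, I am JC Denton.' only closes a subsentence inside one larger sentence. A sketch of what that implies at the console, assuming the comma tokenizes as :spunct; the counts are hand-derived from the stop-mark lists above, not copied from the spec's expected values:

    require 'greeb'

    tokens      = Greeb::Tokenizer.tokenize('Hello, I am JC Denton.')
    segmentator = Greeb::Segmentator.new(tokens)

    segmentator.sentences.size     # => 1 (a comma does not stop a sentence)
    segmentator.subsentences.size  # => 2 ("Hello," and "I am JC Denton.")
    segmentator.extract(segmentator.sentences, segmentator.subsentences)
    # the single sentence key maps to both subsentence entities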
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: greeb
 version: !ruby/object:Gem::Version
-  version: 0.2.0.pre3
+  version: 0.2.0.rc1
 platform: ruby
 authors:
 - Dmitry Ustalov
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-
+date: 2013-05-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -88,7 +88,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: 1.3.1
 requirements: []
 rubyforge_project: greeb
-rubygems_version: 2.0.
+rubygems_version: 2.0.3
 signing_key:
 specification_version: 4
 summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
@@ -99,4 +99,3 @@ test_files:
 - spec/spec_helper.rb
 - spec/support/invoker.rb
 - spec/tokenizer_spec.rb
-has_rdoc:
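Since 0.2.0.rc1 is still a prerelease, RubyGems and Bundler will not resolve to it by default; pinning it in an application requires naming the version explicitly. A Gemfile line for trying out this release candidate (standard Bundler syntax, not taken from this page):

    # Gemfile
    gem 'greeb', '0.2.0.rc1'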