llt-tokenizer 0.0.1

spec/lib/llt/tokenizer_spec.rb ADDED
@@ -0,0 +1,361 @@
+ require 'spec_helper'
+
+ describe LLT::Tokenizer do
+   before(:all) { LLT::DbHandler::Stub.setup }
+
+   let(:stub_db) { LLT::DbHandler::Stub.new }
+   let(:tokenizer) { LLT::Tokenizer.new(db: stub_db) }
+   let(:txt) { "Atque Sex. et M. Cicero." }
+   let(:long_text) { "C. Caesar Antoniusque ratione superavit." }
+   let(:date) { "a. d. V Kal. Apr." }
+
+   context "with default options" do
+     describe "#tokenize" do
+       it "tokenizes a string" do
+         # tokens are C. Caesar -que Antonius ratione superavit .
+         tokenizer.tokenize(long_text).should have(7).items
+       end
+
+       it "handles all kinds of parens as well as cruces" do
+         txt = "Marcus (et Claudius) †amici† [sunt]."
+         tokens = tokenizer.tokenize(txt)
+         tokens.should have(12).items
+         tokens.map(&:to_s).should == %w{ Marcus ( et Claudius ) † amici † [ sunt ] . }
+       end
+
+       describe "takes an optional keyword argument add_to" do
+         class SentenceDummy
+           attr_reader :tokens
+           def initialize; @tokens = []; end
+           def <<(tokens); @tokens += tokens; end
+         end
+
+         it "adds the result to the given object if #<< is implemented" do
+           sentence = SentenceDummy.new
+           t = tokenizer.tokenize("est.", add_to: sentence)
+           sentence.tokens.should == t
+         end
+
+         it "does nothing to the given object when it does not respond to #<<" do
+           object = double(respond_to?: false)
+           object.should_not receive(:<<)
+           tokenizer.tokenize("est.", add_to: object)
+         end
+
+         it "returns an empty array if the argument is an empty string" do
+           tokenizer.tokenize("").should == []
+         end
+
+         it "raises an error if the argument is not a string" do
+           expect { tokenizer.tokenize([]) }.to raise_error ArgumentError
+         end
+
+         context "with quantified text" do
+           it "handles unshifted" do
+             txt = 'M. Cicero pecūniam gaudĭămquĕ incolīs dabit.'
+             tokens = tokenizer.tokenize(txt, shifting: false).map(&:to_s)
+             tokens.should == %w{ M. Cicero pecūniam gaudĭăm -quĕ incolīs dabit . }
+           end
+
+           it "handles shifted" do
+             txt = 'M. Cicero pecūniam gaudĭămquĕ incolīs dabit.'
+             tokens = tokenizer.tokenize(txt, shifting: true).map(&:to_s)
+             tokens.should == %w{ M. Cicero pecūniam -quĕ gaudĭăm incolīs dabit . }
+           end
+
+           it "handles double-shifted" do
+             txt = 'M. Cicero pecūniam Italia in eoquĕ dabit.'
+             tokens = tokenizer.tokenize(txt, shifting: true).map(&:to_s)
+             tokens.should == %w{ M. Cicero pecūniam Italia -quĕ in eo dabit . }
+           end
+
+           it "handles merging" do
+             txt = 'Quăm diu M. Cicero pecūniam Italia dabit.'
+             tokens = tokenizer.tokenize(txt, shifting: true).map(&:to_s)
+             tokens.should == %w{ Quămdiu M. Cicero pecūniam Italia dabit . }
+           end
+         end
+
+         context "with more exotic punctuation" do
+           it "handles -- as a single Punctuation token" do
+             txt = 'Arma -- virum -- cano.'
+             tokens = tokenizer.tokenize(txt)
+             tokens.should have(6).items
+           end
+
+           it "handles ?! as two separate tokens" do
+             txt = 'Arma cano!?'
+             tokens = tokenizer.tokenize(txt)
+             tokens.should have(4).items
+           end
+
+           context "handles direct speech delimiters" do
+             it "'" do
+               txt = "'Arma', inquit 'cano'."
+               tokens = tokenizer.tokenize(txt)
+               tokens.should have(9).items
+             end
+
+             it '"' do
+               txt = '"Arma" inquit "cano".'
+               tokens = tokenizer.tokenize(txt)
+               tokens.should have(8).items
+             end
+
+             it '”' do
+               txt = '”Arma” inquit ”cano”.'
+               tokens = tokenizer.tokenize(txt)
+               tokens.should have(8).items
+             end
+           end
+         end
+
+         context "with embedded xml tags" do
+           it "doesn't break" do
+             txt = '<grc>text text</grc>'
+             tokens = tokenizer.tokenize(txt)
+             tokens.should have(4).items
+           end
+         end
+       end
+     end
+
+     describe "#find_abbreviations_and_join_strings" do
+       describe "should bring back abbreviation dots" do
+         it "with names" do
+           tokenizer.setup("", {}, %w{ Atque Sex . et M . Cicero . })
+           tokenizer.find_abbreviations_and_join_strings
+           tokenizer.preliminary.should == %w{ Atque Sex. et M. Cicero . }
+         end
+
+         it "with roman date" do
+           tokenizer.setup("", {}, %w{ a . d . V Kal . Apr . })
+           tokenizer.find_abbreviations_and_join_strings
+           tokenizer.preliminary.should == %w{ a. d. V Kal. Apr. }
+         end
+       end
+     end
+
+     describe "#split_enklitika_and_change_their_position" do
+       def enklitika_test(example)
+         tokenizer.setup("", {}, example.split)
+         tokenizer.split_enklitika_and_change_their_position
+         tokenizer.preliminary
+       end
+
+       context "when confronted with -que" do
+         # even if "should_not be split" would be more expressive, use
+         # only positive expectations, as they give more detailed feedback
+         examples = {
+           "laetusque" => "-que laetus",
+           "in eoque" => "-que in eo",
+           "In eoque" => "-que In eo",
+           "ad eamque" => "-que ad eam",
+           "ob easque" => "-que ob eas",
+           "neque" => "-que ne",
+           "nec" => "-c ne",
+           "Atque" => "Atque",
+           "atque" => "atque",
+           "cuiusque" => "cuiusque",
+           "denique" => "denique",
+           "itaque" => "itaque",
+           "plerumque" => "plerumque",
+           "quaque" => "quaque",
+           "quemque" => "quemque",
+           "undique" => "undique",
+           "uterque" => "uterque",
+           "utriusque" => "utriusque",
+           "utcumque" => "utcumque",
+           "bonus laetusque et latus altusque" => "bonus -que laetus et latus -que altus",
+         }
+
+         examples.each do |example, expected|
+           it "transforms #{example} to #{expected}" do
+             enklitika_test(example).should be_transformed_to expected
+           end
+         end
+       end
+
+       context "when confronted with -ne" do
+         examples = {
+           "honestumne" => "-ne honestum",
+           "omniane" => "-ne omnia",
+
+           # frequent patterns in third declension nouns
+           "ratione" => "ratione",
+           "magnitudine" => "magnitudine",
+           "Platone" => "Platone",
+           "libidine" => "libidine",
+           "Solone" => "Solone",
+           "homine" => "homine",
+           "flumine" => "flumine",
+
+           # frequent patterns in third declension adjectives
+           "commune" => "commune",
+           "Commune" => "Commune",
+
+           # filtered by the restrictor array
+           "omne" => "omne",
+           "sine" => "sine",
+           "bene" => "bene",
+           "paene" => "paene",
+
+           # ne itself should be contained
+           "ne" => "ne",
+         }
+
+         examples.each do |example, expected|
+           it "transforms #{example} to #{expected}" do
+             enklitika_test(example).should be_transformed_to expected
+           end
+         end
+       end
+
+       context "when confronted with -ve" do
+         examples = {
+           'sive' => 'sive',
+           'pluresve' => '-ve plures',
+           'aestive' => 'aestive',
+           'serve' => 'serve',
+           'suave' => 'suave',
+           'vive' => 'vive',
+           'move' => 'move',
+           'neve' => 'neve'
+         }
+
+         examples.each do |example, expected|
+           it "transforms #{example} to #{expected}" do
+             enklitika_test(example).should be_transformed_to expected
+           end
+         end
+       end
+     end
+
+     describe "#merge_what_needs_merging" do
+       subject do
+         tokenizer.setup("", {}, self.class.description.split)
+         tokenizer.merge_what_needs_merging
+         tokenizer.preliminary
+       end
+
+       describe("quam diu") { it { should be_transformed_to "quamdiu" } }
+       describe("Quam diu") { it { should be_transformed_to "Quamdiu" } }
+       describe("erat diu") { it { should_not be_transformed_to "eratdiu" } }
+       describe("non nullis") { it { should be_transformed_to "nonnullis" } }
+     end
+
+     describe "#create_tokens" do
+       def sentence_element_test(example)
+         tokenizer.setup("", {}, example.split)
+         tokenizer.create_tokens.first
+       end
+
+       examples = {
+         "Word" => %w{ ita Marcus quoque -que },
+         "Filler" => %w{ M. Sex. App. Ap. Tib. Ti. C. a. d. Kal. Ian. }, # I XI MMC
+         "XmlTag" => %w{ <grc> </grc> },
+         "Punctuation" => %w{ , . ! ? † ( ) [ ] ... -- ” " ' }
+       }
+
+       examples.each do |klass, elements|
+         elements.each do |e|
+           it "#{e} is an LLT::#{klass}" do
+             sentence_element_test(e).should be_an_instance_of LLT::Token.const_get(klass)
+           end
+         end
+       end
+
+       it "handles complex xml tags with attributes as well" do
+         tokenizer.setup('', {}, ['<foreign lang="grc">'])
+         tokenizer.create_tokens.first.should be_an_instance_of LLT::Token::XmlTag
+       end
+     end
+
+     it "attaches ids to tokens" do
+       txt = 'Cano.'
+       tokens = tokenizer.tokenize(txt)
+       tokens.map(&:id).should == [1, 2]
+     end
+
+     it "can be disabled" do
+       txt = 'Cano.'
+       tokens = tokenizer.tokenize(txt, indexing: false)
+       tokens.map(&:id).should == [nil, nil]
+     end
+
+     it "doesn't count plain xml tags" do
+       txt = '<grc>text text</grc>'
+       tokens = tokenizer.tokenize(txt)
+       tokens.map(&:id).should == [nil, 1, 2, nil]
+     end
+
+     it "doesn't count xml tags when they come with attributes" do
+       txt = '<foreign lang="lat">Graeca</foreign> lingua est.'
+       tokens = tokenizer.tokenize(txt).map(&:to_s)
+       res = ['<foreign lang="lat">', 'Graeca', '</foreign>', 'lingua', 'est', '.']
+       tokens.should == res
+     end
+
+     it "handles nested xml as well" do
+       txt = '<l n="70"><foreign lang="lat">Graeca lingua est.</foreign></l>'
+       tokens = tokenizer.tokenize(txt).map(&:to_s)
+       res = ['<l n="70">', '<foreign lang="lat">', 'Graeca', 'lingua', 'est', '.', '</foreign>', '</l>']
+       tokens.should == res
+     end
+
+     it "handles text with broken-off xml tags (the rest may be in another sentence, for example)" do
+       txt = "<lg org=\"uniform\" sample=\"complete\"><l>quem vocet divum populus ruentis</l><l>imperi rebus?"
+       tokens = tokenizer.tokenize(txt)
+       tokens.should have(12).items
+     end
+   end
+
+   context "with options" do
+     describe "#tokenize" do
+       context "with custom enclitics marker" do
+         it "uses the given marker" do
+           txt = 'Arma virumque cano.'
+           opts = { enclitics_marker: '--' }
+           tokens = tokenizer.tokenize(txt, opts)
+           tokens.map(&:to_s).should == %w{ Arma --que virum cano . }
+         end
+       end
+
+       context "with disabled shifting" do
+         it "doesn't shift" do
+           txt = 'Arma virumque in carmina et in eoque cano.'
+           opts = { shifting: false }
+           tokens = tokenizer.tokenize(txt, opts).map(&:to_s)
+           tokens.should == %w{ Arma virum -que in carmina et in eo -que cano . }
+         end
+
+         it "doesn't shift (complex)" do
+           txt = 'ratione arma virumque cano.'
+           opts = { shifting: false }
+           tokens = tokenizer.tokenize(txt, opts).map(&:to_s)
+           tokens.should == %w{ ratione arma virum -que cano . }
+         end
+       end
+
+       context "with disabled merging" do
+         it "doesn't merge things like quam diu" do
+           txt = 'quam diu cano?'
+           opts = { merging: false }
+           tokens = tokenizer.tokenize(txt, opts).map(&:to_s)
+           tokens.should == %w{ quam diu cano ? }
+         end
+       end
+     end
+   end
+
+   context "with options on instance creation" do
+     it "a new instance can receive options, which it will use as its defaults" do
+       custom_tok = LLT::Tokenizer.new(db: stub_db,
+                                       shifting: false,
+                                       enclitics_marker: '')
+       tokens = custom_tok.tokenize('Arma virumque cano.').map(&:to_s)
+       tokens.should == %w{ Arma virum que cano . }
+     end
+   end
+ end
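
Read together, these specs double as the API documentation: a tokenizer is built around a db handler and then driven through #tokenize and its keyword options. A minimal usage sketch distilled from the examples above (not part of the diff; it assumes the stub handler from llt-db_handler, wired up exactly as in spec_helper below):

    require 'llt/tokenizer'
    require 'llt/db_handler/stub'

    LLT::DbHandler::Stub.setup
    tokenizer = LLT::Tokenizer.new(db: LLT::DbHandler::Stub.new)

    # With default options, enclitics are split off, marked with '-' and
    # shifted in front of their host word, and tokens receive numeric ids.
    tokens = tokenizer.tokenize('Arma virumque cano.')
    tokens.map(&:to_s)  # => ["Arma", "-que", "virum", "cano", "."]

    # Per-call keyword options override the defaults shown in the specs.
    tokenizer.tokenize('Arma virumque cano.', shifting: false, indexing: false)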
spec/spec_helper.rb ADDED
@@ -0,0 +1,28 @@
+ require 'simplecov'
+ require 'coveralls'
+
+ Coveralls.wear!
+
+ SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
+   SimpleCov::Formatter::HTMLFormatter,
+   Coveralls::SimpleCov::Formatter
+ ]
+
+ SimpleCov.start do
+   add_filter '/spec/'
+ end
+
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
+ require 'llt/tokenizer'
+ require 'llt/db_handler/stub'
+ require 'support/matchers/tokenizer'
+
+ if defined?(LLT::Logger)
+   LLT::Logger.level = nil
+ end
+
+ RSpec.configure do |config|
+   config.treat_symbols_as_metadata_keys_with_true_values = true
+   config.run_all_when_everything_filtered = true
+   config.filter_run :focus
+ end
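
The three configure flags are the standard RSpec 2 focus setup: tag an example with :focus to run it in isolation, with the whole suite running when nothing is tagged. For illustration (hypothetical, reusing an example from the spec above):

    it "tokenizes a string", :focus do
      tokenizer.tokenize(long_text).should have(7).items
    end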
spec/support/matchers/tokenizer.rb ADDED
@@ -0,0 +1,5 @@
+ RSpec::Matchers.define :be_transformed_to do |expected|
+   match do |actual|
+     actual == expected.split
+   end
+ end
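
The matcher just splits the expected string on whitespace and compares it with the preliminary token array, which is what keeps the enklitika tables above so terse. Each table entry reduces to a comparison like:

    enklitika_test("laetusque").should be_transformed_to "-que laetus"
    # i.e. tokenizer.preliminary == %w{ -que laetus }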
metadata ADDED
@@ -0,0 +1,195 @@
+ --- !ruby/object:Gem::Specification
+ name: llt-tokenizer
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - LFDM
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2013-12-08 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: array_scanner
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.3'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.3'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: simplecov
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.7'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.7'
+ - !ruby/object:Gem::Dependency
+   name: llt-core
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: llt-core_extensions
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: llt-db_handler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: llt-helpers
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: LLT's Tokenizer
+ email:
+ - 1986gh@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - ".rspec"
+ - ".travis.yml"
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - lib/llt/token.rb
+ - lib/llt/token/filler.rb
+ - lib/llt/token/punctuation.rb
+ - lib/llt/token/word.rb
+ - lib/llt/token/xml_tag.rb
+ - lib/llt/tokenizer.rb
+ - lib/llt/tokenizer/api.rb
+ - lib/llt/tokenizer/version.rb
+ - lib/llt/tokenizer/worker.rb
+ - llt-tokenizer.gemspec
+ - spec/lib/llt/tokenizer/api_spec.rb
+ - spec/lib/llt/tokenizer_spec.rb
+ - spec/spec_helper.rb
+ - spec/support/matchers/tokenizer.rb
+ homepage: ''
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.1.5
+ signing_key:
+ specification_version: 4
+ summary: Breaks latin sentences into tokens
+ test_files:
+ - spec/lib/llt/tokenizer/api_spec.rb
+ - spec/lib/llt/tokenizer_spec.rb
+ - spec/spec_helper.rb
+ - spec/support/matchers/tokenizer.rb
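
To pull this release into a project, the usual Bundler declaration is all that is needed; the runtime dependencies listed above (llt-core, llt-core_extensions, llt-db_handler, llt-helpers) are resolved automatically:

    # Gemfile
    gem 'llt-tokenizer', '0.0.1'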