llt-tokenizer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,361 @@
1
+ require 'spec_helper'
2
+
3
+ describe LLT::Tokenizer do
4
+ before(:all) { LLT::DbHandler::Stub.setup }
5
+
6
+ let(:stub_db) { LLT::DbHandler::Stub.new }
7
+ let(:tokenizer) { LLT::Tokenizer.new(db: stub_db) }
8
+ let(:txt) { "Atque Sex. et M. Cicero." }
9
+ let(:long_text) { "C. Caesar Antoniusque ratione superavit." }
10
+ let(:date) { "a. d. V Kal. Apr." }
11
+
12
+ context "with default options" do
13
+ describe "#tokenize" do
14
+ it "tokenizes a string" do
15
+ # expected tokens: C. Caesar -que Antonius ratione superavit .
16
+ # (-que is split off Antoniusque and placed in front of it; ratione stays in one piece)
17
+ tokenizer.tokenize(long_text).should have(7).items
18
+ end
19
+
20
+ it "handles all kinds of parens as well as cruces" do
21
+ txt = "Marcus (et Claudius) †amici† [sunt]."
22
+ tokens = tokenizer.tokenize(txt)
23
+ tokens.should have(12).items
24
+ tokens.map(&:to_s).should == %w{ Marcus ( et Claudius ) † amici † [ sunt ] . }
25
+ end
26
+
27
+ describe "takes an optional keyword argument add_to" do
28
+ class SentenceDummy # minimal stand-in for an add_to target: it only collects what is pushed via #<<
29
+ attr_reader :tokens
30
+ def initialize; @tokens = []; end # starts out with no tokens
31
+ def <<(tokens); @tokens += tokens; end # expects an array and concatenates it onto the collected tokens
32
+ end
33
+
34
+ it "adds the result to the given object if #<< is implemented" do
35
+ sentence = SentenceDummy.new
36
+ t = tokenizer.tokenize("est.", add_to: sentence)
37
+ sentence.tokens.should == t
38
+ end
39
+
40
+ it "does nothing to the given object when #<< it does not respond to" do
41
+ object = double(respond_to?: false)
42
+ object.should_not receive(:<<)
43
+ tokenizer.tokenize("est.", add_to: object)
44
+ end
45
+
46
+ it "returns an empty if the argument is an empty string" do
47
+ tokenizer.tokenize("").should == []
48
+ end
49
+
50
+ it "raises an error if argument is not a string" do
51
+ expect { tokenizer.tokenize([]) }.to raise_error ArgumentError
52
+ end
53
+
54
+ context "with quantified text" do
55
+ it "handles unshifted" do
56
+ txt = 'M. Cicero pecūniam gaudĭămquĕ incolīs dabit.'
57
+ tokens = tokenizer.tokenize(txt, shifting: false).map(&:to_s)
58
+ tokens.should == %w{ M. Cicero pecūniam gaudĭăm -quĕ incolīs dabit . }
59
+ end
60
+
61
+ it "handles shifted" do
62
+ txt = 'M. Cicero pecūniam gaudĭămquĕ incolīs dabit.'
63
+ tokens = tokenizer.tokenize(txt, shifting: true).map(&:to_s)
64
+ tokens.should == %w{ M. Cicero pecūniam -quĕ gaudĭăm incolīs dabit . }
65
+ end
66
+
67
+ it "handles double-shifted" do
68
+ txt = 'M. Cicero pecūniam Italia in eoquĕ dabit.'
69
+ tokens = tokenizer.tokenize(txt, shifting: true).map(&:to_s)
70
+ tokens.should == %w{ M. Cicero pecūniam Italia -quĕ in eo dabit . }
71
+ end
72
+
73
+ it "handles merging" do
74
+ txt = 'Quăm diu M. Cicero pecūniam Italia dabit.'
75
+ tokens = tokenizer.tokenize(txt, shifting: true).map(&:to_s)
76
+ tokens.should == %w{ Quămdiu M. Cicero pecūniam Italia dabit . }
77
+ end
78
+ end
79
+
80
+ context "with more exotic punctuation" do
81
+ it "handles -- as single Punctuation token" do
82
+ txt = 'Arma -- virum -- cano.'
83
+ tokens = tokenizer.tokenize(txt)
84
+ tokens.should have(6).items
85
+ end
86
+
87
+ it "handles ?! as two separate tokens" do
88
+ txt = 'Arma cano!?'
89
+ tokens = tokenizer.tokenize(txt)
90
+ tokens.should have(4).items
91
+ end
92
+
93
+ context "handles direct speech delimiters" do
94
+ it "'" do
95
+ txt = "'Arma', inquit 'cano'."
96
+ tokens = tokenizer.tokenize(txt)
97
+ tokens.should have(9).items
98
+ end
99
+
100
+ it '"' do
101
+ txt = '"Arma" inquit "cano".'
102
+ tokens = tokenizer.tokenize(txt)
103
+ tokens.should have(8).items
104
+ end
105
+
106
+ it '”' do
107
+ txt = '”Arma” inquit ”cano”.'
108
+ tokens = tokenizer.tokenize(txt)
109
+ tokens.should have(8).items
110
+ end
111
+ end
112
+ end
113
+
114
+ context "with embedded xml tags" do
115
+ it "doesn't break" do
116
+ txt = '<grc>text text</grc>'
117
+ tokens = tokenizer.tokenize(txt)
118
+ tokens.should have(4).items
119
+ end
120
+ end
121
+ end
122
+ end
123
+
124
+ describe "#find_abbreviations_and_join_strings" do
125
+ describe "should bring back abbreviation dots" do
126
+ it "with names" do
127
+ tokenizer.setup("", {}, %w{ Atque Sex . et M . Cicero . })
128
+ tokenizer.find_abbreviations_and_join_strings
129
+ tokenizer.preliminary.should == %w{ Atque Sex. et M. Cicero . }
130
+ end
131
+
132
+ it "with roman date" do
133
+ tokenizer.setup("", {}, %w{ a . d . V Kal . Apr . })
134
+ tokenizer.find_abbreviations_and_join_strings
135
+ tokenizer.preliminary.should == %w{ a. d. V Kal. Apr. }
136
+ end
137
+ end
138
+ end
139
+
140
+ describe "#split_enklitika_and_change_their_position" do
141
+ def enklitika_test(example) # runs only the enclitic-splitting step on the given space-separated example
142
+ tokenizer.setup("", {}, example.split) # empty text, no options; NOTE(review): third argument presumably presets the preliminary word list - confirm against Tokenizer#setup
143
+ tokenizer.split_enklitika_and_change_their_position
144
+ tokenizer.preliminary # returned so the examples can compare it with be_transformed_to
145
+ end
146
+
147
+ context "when confronted with -que" do
148
+ # even though a negative expectation ("should not be split") would be more expressive,
149
+ # use only positive expectations, as they give more detailed feedback
150
+ examples = {
151
+ "laetusque" => "-que laetus",
152
+ "in eoque" => "-que in eo",
153
+ "In eoque" => "-que In eo",
154
+ "ad eamque" => "-que ad eam",
155
+ "ob easque" => "-que ob eas",
156
+ "neque" => "-que ne",
157
+ "nec" => "-c ne",
158
+ "Atque" => "Atque",
159
+ "atque" => "atque",
160
+ "cuiusque" => "cuiusque",
161
+ "denique" => "denique",
162
+ "itaque" => "itaque",
163
+ "plerumque" => "plerumque",
164
+ "quaque" => "quaque",
165
+ "quemque" => "quemque",
166
+ "undique" => "undique",
167
+ "uterque" => "uterque",
168
+ "utriusque" => "utriusque",
169
+ "utcumque" => "utcumque",
170
+ "bonus laetusque et latus altusque" => "bonus -que laetus et latus -que altus",
171
+ }
172
+
173
+ examples.each do |example, expected|
174
+ it "transforms #{example} to #{expected}" do
175
+ enklitika_test(example).should be_transformed_to expected
176
+ end
177
+ end
178
+ end
179
+
180
+ context "when confronted with -ne" do
181
+ examples = {
182
+ "honestumne" => "-ne honestum",
183
+ "omniane" => "-ne omnia",
184
+
185
+ # frequent patterns in third declension nouns
186
+ "ratione" => "ratione",
187
+ "magnitudine" => "magnitudine",
188
+ "Platone" => "Platone",
189
+ "libidine" => "libidine",
190
+ "Solone" => "Solone",
191
+ "homine" => "homine",
192
+ "flumine" => "flumine",
193
+
194
+ # frequent patterns in third declension adjective
195
+ "commune" => "commune",
196
+ "Commune" => "Commune",
197
+
198
+ # filtered by restrictor array
199
+ "omne" => "omne",
200
+ "sine" => "sine",
201
+ "bene" => "bene",
202
+ "paene" => "paene",
203
+
204
+ # ne itself should be contained
205
+ "ne" => "ne",
206
+ }
207
+
208
+ examples.each do |example, expected|
209
+ it "transforms #{example} to #{expected}" do
210
+ enklitika_test(example).should be_transformed_to expected
211
+ end
212
+ end
213
+ end
214
+
215
+ context "when confronted with -ve" do
216
+ examples = {
217
+ 'sive' => 'sive',
218
+ 'pluresve' => '-ve plures',
219
+ 'aestive' => 'aestive',
220
+ 'serve' => 'serve',
221
+ 'suave' => 'suave',
222
+ 'vive' => 'vive',
223
+ 'move' => 'move',
224
+ 'neve' => 'neve'
225
+ }
226
+
227
+ examples.each do |example, expected|
228
+ it "transforms #{example} to #{expected}" do
229
+ enklitika_test(example).should be_transformed_to expected
230
+ end
231
+ end
232
+ end
233
+ end
234
+
235
+ describe "#merge_what_needs_merging" do
236
+ subject do
237
+ tokenizer.setup("", {}, self.class.description.split)
238
+ tokenizer.merge_what_needs_merging
239
+ tokenizer.preliminary
240
+ end
241
+
242
+ describe("quam diu") { it { should be_transformed_to "quamdiu" } }
243
+ describe("Quam diu") { it { should be_transformed_to "Quamdiu" } }
244
+ describe("erat diu") { it { should_not be_transformed_to "eratdiu" } }
245
+ describe("non nullis") { it { should be_transformed_to "nonnullis" } }
246
+ end
247
+
248
+ describe "#create_tokens" do
249
+ def sentence_element_test(example) # builds tokens from the space-separated example and returns the first one
250
+ tokenizer.setup("", {}, example.split) # empty text, no options; NOTE(review): third argument presumably presets the preliminary word list - confirm against Tokenizer#setup
251
+ tokenizer.create_tokens.first
252
+ end
253
+
254
+ examples = { # keys are token class names, values are the strings expected to become instances of that class
255
+ "Word" => %w{ ita Marcus quoque -que },
256
+ "Filler" => %w{ M. Sex. App. Ap. Tib. Ti. C. a. d. Kal. Ian. }, # NOTE(review): roman numerals (I XI MMC) were left commented out of this list - confirm whether they should be Fillers
257
+ "XmlTag" => %w{ <grc> </grc> },
258
+ "Punctuation" => %w{ , . ! ? † ( ) [ ] ... -- ” " ' }
259
+ }
260
+
261
+ examples.each do |klass, elements|
262
+ elements.each do |e|
263
+ it "#{e} is a LLT::#{klass}" do
264
+ sentence_element_test(e).should be_an_instance_of LLT::Token.const_get(klass)
265
+ end
266
+ end
267
+ end
268
+
269
+ it "handles complex xml tags with attributes as well" do
270
+ tokenizer.setup('', {}, ['<foreign lang="grc">'])
271
+ tokenizer.create_tokens.first.should be_an_instance_of LLT::Token::XmlTag
272
+ end
273
+ end
274
+
275
+ it "attaches id's to tokens" do
276
+ txt = 'Cano.'
277
+ tokens = tokenizer.tokenize(txt)
278
+ tokens.map(&:id).should == [1, 2]
279
+ end
280
+
281
+ it "can be disabled" do
282
+ txt = 'Cano.'
283
+ tokens = tokenizer.tokenize(txt, indexing: false)
284
+ tokens.map(&:id).should == [nil, nil]
285
+ end
286
+
287
+ it "doesn't count plain xml tags" do
288
+ txt = '<grc>text text</grc>'
289
+ tokens = tokenizer.tokenize(txt)
290
+ tokens.map(&:id).should == [nil, 1, 2, nil]
291
+ end
292
+
293
+ it "doesn't count xml tags when they come with attributes" do
294
+ txt = '<foreign lang="lat">Graeca</foreign> lingua est.'
295
+ tokens = tokenizer.tokenize(txt).map(&:to_s)
296
+ res = ['<foreign lang="lat">', 'Graeca', '</foreign>', 'lingua', 'est', '.']
297
+ tokens.should == res
298
+ end
299
+
300
+ it "handles nested xml as well" do
301
+ txt = '<l n="70"><foreign lang="lat">Graeca lingua est.</foreign></l>'
302
+ tokens = tokenizer.tokenize(txt).map(&:to_s)
303
+ res = ['<l n="70">', '<foreign lang="lat">', 'Graeca', 'lingua', 'est', '.', '</foreign>', '</l>']
304
+ tokens.should == res
305
+ end
306
+
307
+ it "handles text with broken off xml tags (the rest will e.g. be in another sentence)" do
308
+ txt = "<lg org=\"uniform\" sample=\"complete\"><l>quem vocet divum populus ruentis</l><l>imperi rebus?"
309
+ tokens = tokenizer.tokenize(txt)
310
+ tokens.should have(12).items
311
+ end
312
+ end
313
+
314
+ context "with options" do
315
+ describe "#tokenize" do
316
+ context "with custom enclitics marker" do
317
+ it "uses the given marker" do
318
+ txt = 'Arma virumque cano.'
319
+ opts = { enclitics_marker: '--' }
320
+ tokens = tokenizer.tokenize(txt, opts)
321
+ tokens.map(&:to_s).should == %w{ Arma --que virum cano . }
322
+ end
323
+ end
324
+
325
+ context "with disabled shifting" do
326
+ it "doesn't shift" do
327
+ txt = 'Arma virumque in carmina et in eoque cano.'
328
+ opts = { shifting: false }
329
+ tokens = tokenizer.tokenize(txt, opts).map(&:to_s)
330
+ tokens.should == %w{ Arma virum -que in carmina et in eo -que cano . }
331
+ end
332
+
333
+ it "doesn't shift (complex)" do
334
+ txt = 'ratione arma virumque cano.'
335
+ opts = { shifting: false }
336
+ tokens = tokenizer.tokenize(txt, opts).map(&:to_s)
337
+ tokens.should == %w{ ratione arma virum -que cano . }
338
+ end
339
+ end
340
+
341
+ context "with disabled merging" do
342
+ it "doesn't merge things like quam diu" do
343
+ txt = 'quam diu cano?'
344
+ opts = { merging: false }
345
+ tokens = tokenizer.tokenize(txt, opts).map(&:to_s)
346
+ tokens.should == %w{ quam diu cano ? }
347
+ end
348
+ end
349
+ end
350
+ end
351
+
352
+ context "with options on instance creation" do
353
+ it "a new instance can receive options, which it will use as it's defaults" do
354
+ custom_tok = LLT::Tokenizer.new(db: stub_db,
355
+ shifting: false,
356
+ enclitics_marker: '')
357
+ tokens = custom_tok.tokenize('Arma virumque cano.').map(&:to_s)
358
+ tokens.should == %w{ Arma virum que cano . }
359
+ end
360
+ end
361
+ end
@@ -0,0 +1,28 @@
1
+ require 'simplecov'
2
+ require 'coveralls'
3
+
4
+ Coveralls.wear! # NOTE(review): wear! appears to configure and start SimpleCov on its own; the explicit setup below then overrides the formatter - confirm the double start is intended
5
+
6
+ SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[ # run both formatters: HTML and Coveralls
7
+ SimpleCov::Formatter::HTMLFormatter,
8
+ Coveralls::SimpleCov::Formatter
9
+ ]
10
+
11
+ SimpleCov.start do # must run before the library is required further down, otherwise its files are not tracked
12
+ add_filter '/spec/' # keep the spec files themselves out of the coverage figures
13
+ end
14
+
15
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
16
+ require 'llt/tokenizer'
17
+ require 'llt/db_handler/stub'
18
+ require 'support/matchers/tokenizer'
19
+
20
+ if defined?(LLT::Logger) # only touch the logger when it has actually been loaded
21
+ LLT::Logger.level = nil # NOTE(review): presumably silences log output during the spec run - confirm what a nil level means for LLT::Logger
22
+ end
23
+
24
+ RSpec.configure do |config|
25
+ config.treat_symbols_as_metadata_keys_with_true_values = true # allows bare symbols such as :focus as example metadata
26
+ config.run_all_when_everything_filtered = true # if no example carries the filter tag, run the whole suite
27
+ config.filter_run :focus # otherwise run only the examples tagged with :focus
28
+ end
@@ -0,0 +1,5 @@
1
+ RSpec::Matchers.define :be_transformed_to do |expected| # expected is given as one space-separated string
2
+ match do |actual|
3
+ actual == expected.split # passes when actual equals the whitespace-split expected string, element by element
4
+ end
5
+ end
metadata ADDED
@@ -0,0 +1,195 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: llt-tokenizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - LFDM
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-12-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: array_scanner
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.3'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.3'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.7'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.7'
83
+ - !ruby/object:Gem::Dependency
84
+ name: llt-core
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: llt-core_extensions
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: llt-db_handler
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: llt-helpers
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ description: LLT's Tokenizer
140
+ email:
141
+ - 1986gh@gmail.com
142
+ executables: []
143
+ extensions: []
144
+ extra_rdoc_files: []
145
+ files:
146
+ - ".gitignore"
147
+ - ".rspec"
148
+ - ".travis.yml"
149
+ - Gemfile
150
+ - LICENSE.txt
151
+ - README.md
152
+ - Rakefile
153
+ - lib/llt/token.rb
154
+ - lib/llt/token/filler.rb
155
+ - lib/llt/token/punctuation.rb
156
+ - lib/llt/token/word.rb
157
+ - lib/llt/token/xml_tag.rb
158
+ - lib/llt/tokenizer.rb
159
+ - lib/llt/tokenizer/api.rb
160
+ - lib/llt/tokenizer/version.rb
161
+ - lib/llt/tokenizer/worker.rb
162
+ - llt-tokenizer.gemspec
163
+ - spec/lib/llt/tokenizer/api_spec.rb
164
+ - spec/lib/llt/tokenizer_spec.rb
165
+ - spec/spec_helper.rb
166
+ - spec/support/matchers/tokenizer.rb
167
+ homepage: ''
168
+ licenses:
169
+ - MIT
170
+ metadata: {}
171
+ post_install_message:
172
+ rdoc_options: []
173
+ require_paths:
174
+ - lib
175
+ required_ruby_version: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - ">="
178
+ - !ruby/object:Gem::Version
179
+ version: '0'
180
+ required_rubygems_version: !ruby/object:Gem::Requirement
181
+ requirements:
182
+ - - ">="
183
+ - !ruby/object:Gem::Version
184
+ version: '0'
185
+ requirements: []
186
+ rubyforge_project:
187
+ rubygems_version: 2.1.5
188
+ signing_key:
189
+ specification_version: 4
190
+ summary: Breaks latin sentences into tokens
191
+ test_files:
192
+ - spec/lib/llt/tokenizer/api_spec.rb
193
+ - spec/lib/llt/tokenizer_spec.rb
194
+ - spec/spec_helper.rb
195
+ - spec/support/matchers/tokenizer.rb