llt-tokenizer 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 17d75e05ecdf64cd93e8de239e57652b4a6375ba
4
- data.tar.gz: b21ab0737044b952a9642a68a9aae48a4d615d4e
3
+ metadata.gz: 9a9abfc5e79b148f497749053c8ccfa7ac9653af
4
+ data.tar.gz: 1c9fe20eb2824eccc1840602beae6552415eb5d2
5
5
  SHA512:
6
- metadata.gz: afadf19b6eb1a7f45dca79631efa57ceb7359d286523d2979658208decffea5d094fdcd4fcc9b3e02f788390bd3e2eeb28c415f3ad7ea97efcdbdf28602c7d2d
7
- data.tar.gz: f752b02680802aafb6305490b139dc0b9ca2e3e35e9b48ca2610f2039e57959571ac432e2b7fa3b4b9e1f05abdc54c9ef8e99f42a2927a2ee25294ecad908fd9
6
+ metadata.gz: 3cd367d754d75f895240c709aed9697140c8359490bc634e56f118b77cc015c2a08c80d7fa4fa74448084844beec4749a7b01b1789c0805a3a5a8fa8d465d5e9
7
+ data.tar.gz: 21c50a75955cab805fb81bc1435963e047171936c015981121de1405378fb4af9c21a69153c0c043d3a504986e1022437690cedb60b88e0b8246ca6fce20565b
data/.travis.yml CHANGED
@@ -4,4 +4,4 @@ before_script:
4
4
  rvm:
5
5
  - 2.1.0
6
6
  - 2.0.0
7
- - jruby-20mode
7
+ - jruby-1.7.8
data/Gemfile CHANGED
@@ -5,15 +5,15 @@ gemspec
5
5
 
6
6
  gem 'coveralls', require: false
7
7
 
8
- gem 'llt-core', git: 'git@github.com:latin-language-toolkit/llt-core.git'
9
- gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
10
- gem 'llt-constants', git: 'git@github.com:latin-language-toolkit/llt-constants.git'
11
- gem 'llt-db_handler', git: 'git@github.com:latin-language-toolkit/llt-db_handler.git'
12
- gem 'llt-db_handler-stub', git: 'git@github.com:latin-language-toolkit/llt-db_handler-stub.git'
13
- gem 'llt-helpers', git: 'git@github.com:latin-language-toolkit/llt-helpers.git'
8
+ gem 'llt-core', git: 'git://github.com/latin-language-toolkit/llt-core.git'
9
+ gem 'llt-core_extensions', git: 'git://github.com/latin-language-toolkit/llt-core_extensions.git'
10
+ gem 'llt-constants', git: 'git://github.com/latin-language-toolkit/llt-constants.git'
11
+ gem 'llt-db_handler', git: 'git://github.com/latin-language-toolkit/llt-db_handler.git'
12
+ gem 'llt-db_handler-stub', git: 'git://github.com/latin-language-toolkit/llt-db_handler-stub.git'
13
+ gem 'llt-helpers', git: 'git://github.com/latin-language-toolkit/llt-helpers.git'
14
14
 
15
15
  # Dependencies of db_handler
16
- gem 'llt-form_builder', git: 'git@github.com:latin-language-toolkit/llt-form_builder.git'
16
+ gem 'llt-form_builder', git: 'git://github.com/latin-language-toolkit/llt-form_builder.git'
17
17
 
18
18
  platform :ruby do
19
19
  gem 'pg'
@@ -23,5 +23,3 @@ platform :jruby do
23
23
  gem 'activerecord-jdbcpostgresql-adapter'
24
24
  gem 'jruby-httpclient'
25
25
  end
26
-
27
- gem 'pry'
data/README.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # LLT::Tokenizer
2
2
 
3
+ [![Version](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/badge_fury.png)](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/badge_fury)
4
+ [![Dependencies](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/gemnasium.png)](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/gemnasium)
5
+ [![Build Status](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/travis.png)](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/travis)
6
+ [![Coverage](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/coveralls.png)](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/coveralls)
7
+ [![Code Climate](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/code_climate.png)](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/code_climate)
8
+
3
9
  Flexible service to tokenize Latin texts.
4
10
 
5
11
  ## Installation
data/lib/llt/token.rb CHANGED
@@ -34,6 +34,10 @@ module LLT
34
34
  @special_roles += roles
35
35
  end
36
36
 
37
+ def ==(other)
38
+ to_s.downcase == other.to_s.downcase
39
+ end
40
+
37
41
  # deprecated
38
42
  def add_form(form)
39
43
  @forms << form
@@ -47,5 +51,9 @@ module LLT
47
51
  def use(*args)
48
52
  # hook method, overwritten by Word
49
53
  end
54
+
55
+ def set_functions(*args)
56
+ # hook method
57
+ end
50
58
  end
51
59
  end
data/lib/llt/tokenizer.rb CHANGED
@@ -4,15 +4,19 @@ require 'llt/constants/abbreviations'
4
4
  require 'llt/core_extensions/array'
5
5
  require 'llt/db_handler/prometheus'
6
6
  require 'llt/helpers/metrical'
7
+ require 'llt/tokenizer/version'
8
+ require 'llt/tokenizer/version_info'
7
9
 
8
10
  module LLT
9
11
  class Tokenizer
10
12
  require 'llt/token'
11
13
  require 'llt/tokenizer/worker'
14
+ require 'llt/tokenizer/greek'
12
15
 
13
16
  include Core::Serviceable
14
17
  include Constants::Abbreviations
15
18
  include Helpers::Metrical
19
+ include Greek
16
20
 
17
21
  uses_db { DbHandler::Prometheus.new }
18
22
 
@@ -26,6 +30,8 @@ module LLT
26
30
  indexing: true,
27
31
  splitting: true,
28
32
  xml: false,
33
+ #for Greek
34
+ krasis_marker: '-'
29
35
  }
30
36
  end
31
37
 
@@ -36,6 +42,8 @@ module LLT
36
42
  setup(text, options)
37
43
 
38
44
  find_abbreviations_and_join_strings
45
+ #for Greek
46
+ split_krasis if @splitting
39
47
  split_enklitika_and_change_their_position if @splitting
40
48
  merge_what_needs_merging if @merging # quam diu => quamdiu
41
49
  tokens = create_tokens
@@ -53,11 +61,13 @@ module LLT
53
61
  @splitting = parse_option(:splitting, options)
54
62
  @indexing = parse_option(:indexing, options)
55
63
  @xml = parse_option(:xml, options)
64
+ #for Greek
65
+ @krasis_marker = parse_option(:krasis_marker, options)
56
66
  @worker = setup_worker(worker)
57
67
  @shift_range = shift_range(@shifting)
58
68
  end
59
69
 
60
- PUNCTUATION = /&(?:amp|quot|apos|lt|gt);|([\.\?,!;\-:"'”&\(\)\[\]†<>])\1*/
70
+ PUNCTUATION = /&(?:amp|quot|apos|lt|gt);|([\.\?,!;\-:"'”&\(\)\[\]†<>᾽·])\1*/
61
71
  XML_TAG = /<\/?.+?>/
62
72
 
63
73
  # This is here for two reasons:
@@ -129,7 +139,7 @@ module LLT
129
139
  arr = []
130
140
  @worker.each_with_index do |e, i|
131
141
  n = @worker[i + 1]
132
- if (n == '.' && e =~ ABBREVIATIONS) || (n == "'" && e =~ APOSTROPHE_WORDS)
142
+ if (n == '.' && e =~ ABBREVIATIONS) || (n == "'" && e =~ APOSTROPHE_WORDS) || greek_apostrophe(n,e)
133
143
  @worker[i + 1] = n.prepend(e)
134
144
  arr << (i - arr.size)
135
145
  end
@@ -141,7 +151,7 @@ module LLT
141
151
  ######################
142
152
 
143
153
  WORDS_ENDING_WITH_QUE = /^((un.{1,3})?[qc]u[aei].*que|qu[ao]que|itaque|atque|ut[er].*que|.*cumque|pler(.{1,2}|[oa]rum)que|denique|undique|usque)$/i # neque taken out!
144
- WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene|iuvene)$/i
154
+ WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene|iuvene|siccine)$/i # generalize these words and start to look for them in the db, especially for adverbs
145
155
  WORDS_ENDING_WITH_VE = /^(sive|neve)$/i
146
156
 
147
157
  # laetusque to -que laetus
@@ -195,7 +205,7 @@ module LLT
195
205
  def split_nec
196
206
  indices = []
197
207
  @worker.each_with_index do |token, i|
198
- if token == 'nec'
208
+ if token =~ /^nec$/i
199
209
  token.slice!(-1)
200
210
  indices << (i + indices.size + @shift_range)
201
211
  end
@@ -247,7 +257,7 @@ module LLT
247
257
  entries = []
248
258
  entries += lookup(orig_el, :noun, :nom) if orig_el =~ /io$/ # actio-ne ratio-ne
249
259
  entries += lookup(orig_el + "n", :persona, :stem) if orig_el =~ /o$/ # Plato-ne Cicero-ne Solo-ne
250
- entries += lookup(orig_el + "n", :noun, :stem, [3, 33]) if orig_el =~ /[ei]$/ # fortitudi-ne ratio-ne libidi-ne homi-ne fi-ne agmi-ne iuve-ne
260
+ entries += lookup(orig_el + "n", :noun, :stem, [3, 33]) # fortitudi-ne ratio-ne libidi-ne homi-ne fi-ne agmi-ne iuve-ne ig-ne
251
261
  entries += lookup(orig_el + "n", :noun, :stem, 2) # domi-ne
252
262
  entries += lookup(orig_el + "n", :adjective, :stem, [1,3]) # communis commune, or bonus
253
263
 
@@ -319,7 +329,6 @@ module LLT
319
329
  end
320
330
  end
321
331
 
322
-
323
332
  ######################
324
333
 
325
334
  MERGE_WORDS = [ %w{ quam diu }, ['non', /null.{1,4}$/] ]
@@ -5,6 +5,7 @@ require 'llt/core/api'
5
5
 
6
6
  class Api < Sinatra::Base
7
7
  register Sinatra::RespondWith
8
+ register LLT::Core::Api::VersionRoutes
8
9
  helpers LLT::Core::Api::Helpers
9
10
 
10
11
  get '/tokenize' do
@@ -17,4 +18,6 @@ class Api < Sinatra::Base
17
18
  f.xml { to_xml(tokens, params) }
18
19
  end
19
20
  end
21
+
22
+ add_version_route_for('/tokenize', dependencies: %i{ Core Tokenizer })
20
23
  end
@@ -0,0 +1,69 @@
1
+ module LLT
2
+ class Tokenizer
3
+ module Greek
4
+ PLAIN_VOWELS = %w(α ε ι η ο υ ω)
5
+ VOWELS_WITH_ACUTE = %w(ά έ ή ί ó ύ ώ)
6
+ VOWELS_WITH_GRAVE = %w(ὰ ὲ ὴ ì ò ὺ ὼ)
7
+ VOWELS_WITH_CIRCUMFLEX = %w(ᾶ ῆ ῖ ῦ ῶ)
8
+ VOWELS_WITH_IOTA = %w(ᾲ ᾳ ᾴ ᾷ ῂ ῃ ῄ ῇ ῲ ῳ ῴ ῷ)
9
+ CONSONANTS = %w(β γ δ ζ θ κ λ μ ν ξ π ρ ῥ ῤ σ ς τ φ χ ψ)
10
+ VOWELS = [PLAIN_VOWELS,
11
+ VOWELS_WITH_ACUTE,
12
+ VOWELS_WITH_GRAVE,
13
+ VOWELS_WITH_CIRCUMFLEX,
14
+ VOWELS_WITH_IOTA
15
+ ].flatten
16
+
17
+ SPIRITUS_LENIS = %w(ἀ ἐ ἠ ἰ ὀ ὐ ὠ)
18
+ SPIRITUS_LENIS_WITH_GRAVE = %w(ἂ ἒ ἲ ἢ ὂ ὒ ὢ)
19
+ SPIRITUS_LENIS_WITH_ACUTE = %w(ἄ ἔ ἴ ἤ ὄ ὔ ὤ)
20
+ SPIRITUS_LENIS_WITH_CIRCUMFLEX = %w(ἆ ἶ ἦ ὖ ὦ )
21
+
22
+ SPIRITUS_ASPER = %w(ἁ ἑ ἡ ἱ ὁ ὑ ὡ)
23
+ SPIRITUS_ASPER_WITH_GRAVE = %w(ἃ ἣ ἓ ἳ ὃ ὓ ὣ)
24
+ SPIRITUS_ASPER_WITH_ACUTE = %w(ἅ ἥ ἕ ἵ ὅ ὕ ὥ)
25
+ SPIRITUS_ASPER_WITH_CIRCUMFLEX = %w(ἇ ἷ ἧ ὗ ὧ)
26
+
27
+ SPIRITUS_WITH_IOTA = %w(ᾀ ᾁ ᾂ ᾃ ᾄ ᾅ ᾆ ᾇ ᾐ ᾑ ᾒ ᾓ ᾔ ᾕ ᾖ ᾗ ᾠ ᾡ ᾢ ᾣ ᾤ ᾥ ᾦ ᾧ)
28
+
29
+ VOWELS_WITH_SPIRITUS = [
30
+ SPIRITUS_LENIS,
31
+ SPIRITUS_LENIS_WITH_ACUTE,
32
+ SPIRITUS_LENIS_WITH_GRAVE,
33
+ SPIRITUS_LENIS_WITH_CIRCUMFLEX,
34
+ SPIRITUS_ASPER,
35
+ SPIRITUS_ASPER_WITH_ACUTE,
36
+ SPIRITUS_ASPER_WITH_GRAVE,
37
+ SPIRITUS_ASPER_WITH_CIRCUMFLEX,
38
+ SPIRITUS_WITH_IOTA
39
+ ].flatten
40
+
41
+ STARTING_VOWELS = Regexp.union(VOWELS_WITH_SPIRITUS)
42
+ CONS = Regexp.union(CONSONANTS)
43
+ ALL = Regexp.union([CONSONANTS, VOWELS].flatten)
44
+
45
+ def split_krasis
46
+ @worker.each_with_index do |token, i|
47
+ if resolved_krasis = contains_krasis(token)
48
+ @worker[i] = resolved_krasis
49
+ @worker.flatten!
50
+ end
51
+ end
52
+ end
53
+
54
+ def krasis(token)
55
+ "#{token}#{@krasis_marker}"
56
+ end
57
+
58
+ def contains_krasis(token)
59
+ if token.match(/^(#{CONS})(#{PLAIN_VOWELS}?#{STARTING_VOWELS})(#{ALL}*)$/)
60
+ [krasis($1), $2+$3]
61
+ end
62
+ end
63
+
64
+ def greek_apostrophe(n, e)
65
+ (n == "᾽" && e =~ CONS)
66
+ end
67
+ end
68
+ end
69
+ end
@@ -1,5 +1,5 @@
1
1
  module LLT
2
2
  class Tokenizer
3
- VERSION = "0.0.5"
3
+ VERSION = "0.0.6"
4
4
  end
5
5
  end
@@ -0,0 +1,7 @@
1
+ module LLT
2
+ class Tokenizer
3
+ class VersionInfo
4
+ include Core::Versioner
5
+ end
6
+ end
7
+ end
@@ -20,7 +20,7 @@ Gem::Specification.new do |spec|
20
20
 
21
21
  spec.add_development_dependency "bundler", "~> 1.3"
22
22
  spec.add_development_dependency "rake"
23
- spec.add_development_dependency "rspec"
23
+ spec.add_development_dependency "rspec", "2.14"
24
24
  spec.add_development_dependency "simplecov", "~> 0.7"
25
25
  spec.add_dependency "array_scanner"
26
26
  spec.add_dependency "llt-core"
@@ -0,0 +1,25 @@
1
+ require 'spec_helper'
2
+
3
+ describe LLT::Token do
4
+ let(:token) { LLT::Token }
5
+
6
+ describe "#==" do
7
+ it "equals when two tokens have the same string value" do
8
+ t1 = token.new('bene')
9
+ t2 = token.new('bene')
10
+ t1.should == t2
11
+ end
12
+
13
+ it "doesn't equal when the strings are different" do
14
+ t1 = token.new('bene')
15
+ t2 = token.new('male')
16
+ t1.should_not == t2
17
+ end
18
+
19
+ it "is case insensitive" do
20
+ t1 = token.new('bene')
21
+ t2 = token.new('Bene')
22
+ t1.should == t2
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,66 @@
1
+ require 'spec_helper'
2
+
3
+ describe LLT::Tokenizer::Greek do
4
+ let(:tokenizer) { LLT::Tokenizer.new }
5
+ let(:greek_txt) { "καὶ διὰ τῆς περὶ τὴν ἀρχαιολογίαν συγγραφῆς."}
6
+ let(:krasis) { "κἄπειτα." }
7
+ let(:double_krasis) { "κἄπειτα τῆς περὶ τὴν ἀρχαιολογίαν κἄπειτα." }
8
+ let(:diphtong) { "τοὔνομα." }
9
+
10
+ context "with greek tokens" do
11
+ describe "#tokenize" do
12
+ it "tokenizes a string" do
13
+ res = tokenizer.tokenize(greek_txt)
14
+ res.should == %w(καὶ διὰ τῆς περὶ τὴν ἀρχαιολογίαν συγγραφῆς .)
15
+ res.should have(8).items
16
+ end
17
+
18
+ describe "with a string that contains an apostrophe" do
19
+ it "returns one token to which the apostrophe is attached" do
20
+ txt = "εὖ δ᾽ ἴστε."
21
+ res = tokenizer.tokenize(txt)
22
+ res.should == %w(εὖ δ᾽ ἴστε .)
23
+ res.should have(4).items
24
+ end
25
+
26
+ it "splits two tokens combined by an apostrophe" do
27
+ txt = "εὖ δ᾽ἴστε."
28
+ res = tokenizer.tokenize(txt)
29
+ res.should == %w(εὖ δ᾽ ἴστε .)
30
+ res.should have(4).items
31
+ end
32
+ end
33
+ end
34
+
35
+ describe "handles krasis" do
36
+ it "splits a krasis into two words" do
37
+ res = tokenizer.tokenize(krasis)
38
+ res.should have(3).items
39
+ res.should == %w( κ- ἄπειτα . )
40
+ end
41
+
42
+ it "handles a diphthong krasis" do
43
+ res = tokenizer.tokenize(diphtong)
44
+ res.should have(3).items
45
+ end
46
+
47
+ it "splits two kraseis in a sentence" do
48
+ res = tokenizer.tokenize(double_krasis)
49
+ res.should have(9).items
50
+ res[2].should == "τῆς"
51
+ res[8].should == "."
52
+ end
53
+
54
+ context "with options" do
55
+ context "with disabled splitting" do
56
+ it "doesn't split krasis" do
57
+ txt = 'κἄπειτα.'
58
+ opts = { splitting: false }
59
+ tokens = tokenizer.tokenize(txt, opts).map(&:to_s)
60
+ tokens.should == %w{ κἄπειτα . }
61
+ end
62
+ end
63
+ end
64
+ end
65
+ end
66
+ end
@@ -158,7 +158,9 @@ describe LLT::Tokenizer do
158
158
  "ad eamque" => "-que ad eam",
159
159
  "ob easque" => "-que ob eas",
160
160
  "neque" => "-que ne",
161
+ "Neque" => "-que Ne",
161
162
  "nec" => "-c ne",
163
+ "Nec" => "-c Ne",
162
164
  "Atque" => "Atque",
163
165
  "atque" => "atque",
164
166
  "cuiusque" => "cuiusque",
@@ -208,6 +210,7 @@ describe LLT::Tokenizer do
208
210
  "fine" => "fine",
209
211
  "iuvene" => "iuvene",
210
212
  "sanguine" => "sanguine",
213
+ "igne" => "igne",
211
214
 
212
215
  # frequent patterns in third declension adjective
213
216
  "commune" => "commune",
@@ -282,7 +285,7 @@ describe LLT::Tokenizer do
282
285
  "Word" => %w{ ita Marcus quoque -que po' },
283
286
  "Filler" => %w{ M. Sex. App. Ap. Tib. Ti. C. a. d. Kal. Ian. }, #I XI MMC }
284
287
  "XmlTag" => %w{ <grc> </grc> },
285
- "Punctuation" => %w{ , . ! ? † ( ) [ ] ... -- ” " ' & < > &amp; &lt; &gt; &apos; &quot; }
288
+ "Punctuation" => %w{ , . ! ? † ( ) [ ] ... -- ” " ' ᾽ · & < > &amp; &lt; &gt; &apos; &quot; }
286
289
  }
287
290
 
288
291
  examples.each do |klass, elements|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llt-tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - LFDM
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-11 00:00:00.000000000 Z
11
+ date: 2014-08-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -42,16 +42,16 @@ dependencies:
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ">="
45
+ - - '='
46
46
  - !ruby/object:Gem::Version
47
- version: '0'
47
+ version: '2.14'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ">="
52
+ - - '='
53
53
  - !ruby/object:Gem::Version
54
- version: '0'
54
+ version: '2.14'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: simplecov
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -158,11 +158,15 @@ files:
158
158
  - lib/llt/token/xml_tag.rb
159
159
  - lib/llt/tokenizer.rb
160
160
  - lib/llt/tokenizer/api.rb
161
+ - lib/llt/tokenizer/greek.rb
161
162
  - lib/llt/tokenizer/version.rb
163
+ - lib/llt/tokenizer/version_info.rb
162
164
  - lib/llt/tokenizer/worker.rb
163
165
  - llt-tokenizer.gemspec
164
166
  - spec/lib/llt/token/punctuation_spec.rb
167
+ - spec/lib/llt/token_spec.rb
165
168
  - spec/lib/llt/tokenizer/api_spec.rb
169
+ - spec/lib/llt/tokenizer/greek_spec.rb
166
170
  - spec/lib/llt/tokenizer_spec.rb
167
171
  - spec/spec_helper.rb
168
172
  - spec/support/matchers/tokenizer.rb
@@ -192,7 +196,9 @@ specification_version: 4
192
196
  summary: Breaks latin sentences into tokens
193
197
  test_files:
194
198
  - spec/lib/llt/token/punctuation_spec.rb
199
+ - spec/lib/llt/token_spec.rb
195
200
  - spec/lib/llt/tokenizer/api_spec.rb
201
+ - spec/lib/llt/tokenizer/greek_spec.rb
196
202
  - spec/lib/llt/tokenizer_spec.rb
197
203
  - spec/spec_helper.rb
198
204
  - spec/support/matchers/tokenizer.rb