llt-tokenizer 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 17d75e05ecdf64cd93e8de239e57652b4a6375ba
4
- data.tar.gz: b21ab0737044b952a9642a68a9aae48a4d615d4e
3
+ metadata.gz: 9a9abfc5e79b148f497749053c8ccfa7ac9653af
4
+ data.tar.gz: 1c9fe20eb2824eccc1840602beae6552415eb5d2
5
5
  SHA512:
6
- metadata.gz: afadf19b6eb1a7f45dca79631efa57ceb7359d286523d2979658208decffea5d094fdcd4fcc9b3e02f788390bd3e2eeb28c415f3ad7ea97efcdbdf28602c7d2d
7
- data.tar.gz: f752b02680802aafb6305490b139dc0b9ca2e3e35e9b48ca2610f2039e57959571ac432e2b7fa3b4b9e1f05abdc54c9ef8e99f42a2927a2ee25294ecad908fd9
6
+ metadata.gz: 3cd367d754d75f895240c709aed9697140c8359490bc634e56f118b77cc015c2a08c80d7fa4fa74448084844beec4749a7b01b1789c0805a3a5a8fa8d465d5e9
7
+ data.tar.gz: 21c50a75955cab805fb81bc1435963e047171936c015981121de1405378fb4af9c21a69153c0c043d3a504986e1022437690cedb60b88e0b8246ca6fce20565b
data/.travis.yml CHANGED
@@ -4,4 +4,4 @@ before_script:
4
4
  rvm:
5
5
  - 2.1.0
6
6
  - 2.0.0
7
- - jruby-20mode
7
+ - jruby-1.7.8
data/Gemfile CHANGED
@@ -5,15 +5,15 @@ gemspec
5
5
 
6
6
  gem 'coveralls', require: false
7
7
 
8
- gem 'llt-core', git: 'git@github.com:latin-language-toolkit/llt-core.git'
9
- gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
10
- gem 'llt-constants', git: 'git@github.com:latin-language-toolkit/llt-constants.git'
11
- gem 'llt-db_handler', git: 'git@github.com:latin-language-toolkit/llt-db_handler.git'
12
- gem 'llt-db_handler-stub', git: 'git@github.com:latin-language-toolkit/llt-db_handler-stub.git'
13
- gem 'llt-helpers', git: 'git@github.com:latin-language-toolkit/llt-helpers.git'
8
+ gem 'llt-core', git: 'git://github.com/latin-language-toolkit/llt-core.git'
9
+ gem 'llt-core_extensions', git: 'git://github.com/latin-language-toolkit/llt-core_extensions.git'
10
+ gem 'llt-constants', git: 'git://github.com/latin-language-toolkit/llt-constants.git'
11
+ gem 'llt-db_handler', git: 'git://github.com/latin-language-toolkit/llt-db_handler.git'
12
+ gem 'llt-db_handler-stub', git: 'git://github.com/latin-language-toolkit/llt-db_handler-stub.git'
13
+ gem 'llt-helpers', git: 'git://github.com/latin-language-toolkit/llt-helpers.git'
14
14
 
15
15
  # Dependencies of db_handler
16
- gem 'llt-form_builder', git: 'git@github.com:latin-language-toolkit/llt-form_builder.git'
16
+ gem 'llt-form_builder', git: 'git://github.com/latin-language-toolkit/llt-form_builder.git'
17
17
 
18
18
  platform :ruby do
19
19
  gem 'pg'
@@ -23,5 +23,3 @@ platform :jruby do
23
23
  gem 'activerecord-jdbcpostgresql-adapter'
24
24
  gem 'jruby-httpclient'
25
25
  end
26
-
27
- gem 'pry'
data/README.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # LLT::Tokenizer
2
2
 
3
+ [![Version](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/badge_fury.png)](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/badge_fury)
4
+ [![Dependencies](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/gemnasium.png)](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/gemnasium)
5
+ [![Build Status](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/travis.png)](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/travis)
6
+ [![Coverage](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/coveralls.png)](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/coveralls)
7
+ [![Code Climate](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/code_climate.png)](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/code_climate)
8
+
3
9
  Flexible service to tokenize Latin texts.
4
10
 
5
11
  ## Installation
data/lib/llt/token.rb CHANGED
@@ -34,6 +34,10 @@ module LLT
34
34
  @special_roles += roles
35
35
  end
36
36
 
37
+ def ==(other)
38
+ to_s.downcase == other.to_s.downcase
39
+ end
40
+
37
41
  # deprecated
38
42
  def add_form(form)
39
43
  @forms << form
@@ -47,5 +51,9 @@ module LLT
47
51
  def use(*args)
48
52
  # hook method, overwritten by Word
49
53
  end
54
+
55
+ def set_functions(*args)
56
+ # hook method
57
+ end
50
58
  end
51
59
  end
data/lib/llt/tokenizer.rb CHANGED
@@ -4,15 +4,19 @@ require 'llt/constants/abbreviations'
4
4
  require 'llt/core_extensions/array'
5
5
  require 'llt/db_handler/prometheus'
6
6
  require 'llt/helpers/metrical'
7
+ require 'llt/tokenizer/version'
8
+ require 'llt/tokenizer/version_info'
7
9
 
8
10
  module LLT
9
11
  class Tokenizer
10
12
  require 'llt/token'
11
13
  require 'llt/tokenizer/worker'
14
+ require 'llt/tokenizer/greek'
12
15
 
13
16
  include Core::Serviceable
14
17
  include Constants::Abbreviations
15
18
  include Helpers::Metrical
19
+ include Greek
16
20
 
17
21
  uses_db { DbHandler::Prometheus.new }
18
22
 
@@ -26,6 +30,8 @@ module LLT
26
30
  indexing: true,
27
31
  splitting: true,
28
32
  xml: false,
33
+ #for Greek
34
+ krasis_marker: '-'
29
35
  }
30
36
  end
31
37
 
@@ -36,6 +42,8 @@ module LLT
36
42
  setup(text, options)
37
43
 
38
44
  find_abbreviations_and_join_strings
45
+ #for Greek
46
+ split_krasis if @splitting
39
47
  split_enklitika_and_change_their_position if @splitting
40
48
  merge_what_needs_merging if @merging # quam diu => quamdiu
41
49
  tokens = create_tokens
@@ -53,11 +61,13 @@ module LLT
53
61
  @splitting = parse_option(:splitting, options)
54
62
  @indexing = parse_option(:indexing, options)
55
63
  @xml = parse_option(:xml, options)
64
+ #for Greek
65
+ @krasis_marker = parse_option(:krasis_marker, options)
56
66
  @worker = setup_worker(worker)
57
67
  @shift_range = shift_range(@shifting)
58
68
  end
59
69
 
60
- PUNCTUATION = /&(?:amp|quot|apos|lt|gt);|([\.\?,!;\-:"'”&\(\)\[\]†<>])\1*/
70
+ PUNCTUATION = /&(?:amp|quot|apos|lt|gt);|([\.\?,!;\-:"'”&\(\)\[\]†<>᾽·])\1*/
61
71
  XML_TAG = /<\/?.+?>/
62
72
 
63
73
  # This is here for two reasons:
@@ -129,7 +139,7 @@ module LLT
129
139
  arr = []
130
140
  @worker.each_with_index do |e, i|
131
141
  n = @worker[i + 1]
132
- if (n == '.' && e =~ ABBREVIATIONS) || (n == "'" && e =~ APOSTROPHE_WORDS)
142
+ if (n == '.' && e =~ ABBREVIATIONS) || (n == "'" && e =~ APOSTROPHE_WORDS) || greek_apostrophe(n,e)
133
143
  @worker[i + 1] = n.prepend(e)
134
144
  arr << (i - arr.size)
135
145
  end
@@ -141,7 +151,7 @@ module LLT
141
151
  ######################
142
152
 
143
153
  WORDS_ENDING_WITH_QUE = /^((un.{1,3})?[qc]u[aei].*que|qu[ao]que|itaque|atque|ut[er].*que|.*cumque|pler(.{1,2}|[oa]rum)que|denique|undique|usque)$/i # neque taken out!
144
- WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene|iuvene)$/i
154
+ WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene|iuvene|siccine)$/i # generalize these words and start to look for them in the db, especially for adverbs
145
155
  WORDS_ENDING_WITH_VE = /^(sive|neve)$/i
146
156
 
147
157
  # laetusque to -que laetus
@@ -195,7 +205,7 @@ module LLT
195
205
  def split_nec
196
206
  indices = []
197
207
  @worker.each_with_index do |token, i|
198
- if token == 'nec'
208
+ if token =~ /^nec$/i
199
209
  token.slice!(-1)
200
210
  indices << (i + indices.size + @shift_range)
201
211
  end
@@ -247,7 +257,7 @@ module LLT
247
257
  entries = []
248
258
  entries += lookup(orig_el, :noun, :nom) if orig_el =~ /io$/ # actio-ne ratio-ne
249
259
  entries += lookup(orig_el + "n", :persona, :stem) if orig_el =~ /o$/ # Plato-ne Cicero-ne Solo-ne
250
- entries += lookup(orig_el + "n", :noun, :stem, [3, 33]) if orig_el =~ /[ei]$/ # fortitudi-ne ratio-ne libidi-ne homi-ne fi-ne agmi-ne iuve-ne
260
+ entries += lookup(orig_el + "n", :noun, :stem, [3, 33]) # fortitudi-ne ratio-ne libidi-ne homi-ne fi-ne agmi-ne iuve-ne ig-ne
251
261
  entries += lookup(orig_el + "n", :noun, :stem, 2) # domi-ne
252
262
  entries += lookup(orig_el + "n", :adjective, :stem, [1,3]) # communis commune, or bonus
253
263
 
@@ -319,7 +329,6 @@ module LLT
319
329
  end
320
330
  end
321
331
 
322
-
323
332
  ######################
324
333
 
325
334
  MERGE_WORDS = [ %w{ quam diu }, ['non', /null.{1,4}$/] ]
@@ -5,6 +5,7 @@ require 'llt/core/api'
5
5
 
6
6
  class Api < Sinatra::Base
7
7
  register Sinatra::RespondWith
8
+ register LLT::Core::Api::VersionRoutes
8
9
  helpers LLT::Core::Api::Helpers
9
10
 
10
11
  get '/tokenize' do
@@ -17,4 +18,6 @@ class Api < Sinatra::Base
17
18
  f.xml { to_xml(tokens, params) }
18
19
  end
19
20
  end
21
+
22
+ add_version_route_for('/tokenize', dependencies: %i{ Core Tokenizer })
20
23
  end
@@ -0,0 +1,69 @@
1
+ module LLT
2
+ class Tokenizer
3
+ module Greek
4
+ PLAIN_VOWELS = %w(α ε ι η ο υ ω)
5
+ VOWELS_WITH_ACUTE = %w(ά έ ή ί ó ύ ώ)
6
+ VOWELS_WITH_GRAVE = %w(ὰ ὲ ὴ ì ò ὺ ὼ)
7
+ VOWELS_WITH_CIRCUMFLEX = %w(ᾶ ῆ ῖ ῦ ῶ)
8
+ VOWELS_WITH_IOTA = %w(ᾲ ᾳ ᾴ ᾷ ῂ ῃ ῄ ῇ ῲ ῳ ῴ ῷ)
9
+ CONSONANTS = %w(β γ δ ζ θ κ λ μ ν ξ π ρ ῥ ῤ σ ς τ φ χ ψ)
10
+ VOWELS = [PLAIN_VOWELS,
11
+ VOWELS_WITH_ACUTE,
12
+ VOWELS_WITH_GRAVE,
13
+ VOWELS_WITH_CIRCUMFLEX,
14
+ VOWELS_WITH_IOTA
15
+ ].flatten
16
+
17
+ SPIRITUS_LENIS = %w(ἀ ἐ ἠ ἰ ὀ ὐ ὠ)
18
+ SPIRITUS_LENIS_WITH_GRAVE = %w(ἂ ἒ ἲ ἢ ὂ ὒ ὢ)
19
+ SPIRITUS_LENIS_WITH_ACUTE = %w(ἄ ἔ ἴ ἤ ὄ ὔ ὤ)
20
+ SPIRITUS_LENIS_WITH_CIRCUMFLEX = %w(ἆ ἶ ἦ ὖ ὦ )
21
+
22
+ SPIRITUS_ASPER = %w(ἁ ἑ ἡ ἱ ὁ ὑ ὡ)
23
+ SPIRITUS_ASPER_WITH_GRAVE = %w(ἃ ἣ ἓ ἳ ὃ ὓ ὣ)
24
+ SPIRITUS_ASPER_WITH_ACUTE = %w(ἅ ἥ ἕ ἵ ὅ ὕ ὥ)
25
+ SPIRITUS_ASPER_WITH_CIRCUMFLEX = %w(ἇ ἷ ἧ ὗ ὧ)
26
+
27
+ SPIRITUS_WITH_IOTA = %w(ᾀ ᾁ ᾂ ᾃ ᾄ ᾅ ᾆ ᾇ ᾐ ᾑ ᾒ ᾓ ᾔ ᾕ ᾖ ᾗ ᾠ ᾡ ᾢ ᾣ ᾤ ᾥ ᾦ ᾧ)
28
+
29
+ VOWELS_WITH_SPIRITUS = [
30
+ SPIRITUS_LENIS,
31
+ SPIRITUS_LENIS_WITH_ACUTE,
32
+ SPIRITUS_LENIS_WITH_GRAVE,
33
+ SPIRITUS_LENIS_WITH_CIRCUMFLEX,
34
+ SPIRITUS_ASPER,
35
+ SPIRITUS_ASPER_WITH_ACUTE,
36
+ SPIRITUS_ASPER_WITH_GRAVE,
37
+ SPIRITUS_ASPER_WITH_CIRCUMFLEX,
38
+ SPIRITUS_WITH_IOTA
39
+ ].flatten
40
+
41
+ STARTING_VOWELS = Regexp.union(VOWELS_WITH_SPIRITUS)
42
+ CONS = Regexp.union(CONSONANTS)
43
+ ALL = Regexp.union([CONSONANTS, VOWELS].flatten)
44
+
45
+ def split_krasis
46
+ @worker.each_with_index do |token, i|
47
+ if resolved_krasis = contains_krasis(token)
48
+ @worker[i] = resolved_krasis
49
+ @worker.flatten!
50
+ end
51
+ end
52
+ end
53
+
54
+ def krasis(token)
55
+ "#{token}#{@krasis_marker}"
56
+ end
57
+
58
+ def contains_krasis(token)
59
+ if token.match(/^(#{CONS})(#{PLAIN_VOWELS}?#{STARTING_VOWELS})(#{ALL}*)$/)
60
+ [krasis($1), $2+$3]
61
+ end
62
+ end
63
+
64
+ def greek_apostrophe(n, e)
65
+ (n == "᾽" && e =~ CONS)
66
+ end
67
+ end
68
+ end
69
+ end
@@ -1,5 +1,5 @@
1
1
  module LLT
2
2
  class Tokenizer
3
- VERSION = "0.0.5"
3
+ VERSION = "0.0.6"
4
4
  end
5
5
  end
@@ -0,0 +1,7 @@
1
+ module LLT
2
+ class Tokenizer
3
+ class VersionInfo
4
+ include Core::Versioner
5
+ end
6
+ end
7
+ end
@@ -20,7 +20,7 @@ Gem::Specification.new do |spec|
20
20
 
21
21
  spec.add_development_dependency "bundler", "~> 1.3"
22
22
  spec.add_development_dependency "rake"
23
- spec.add_development_dependency "rspec"
23
+ spec.add_development_dependency "rspec", "2.14"
24
24
  spec.add_development_dependency "simplecov", "~> 0.7"
25
25
  spec.add_dependency "array_scanner"
26
26
  spec.add_dependency "llt-core"
@@ -0,0 +1,25 @@
1
+ require 'spec_helper'
2
+
3
+ describe LLT::Token do
4
+ let(:token) { LLT::Token }
5
+
6
+ describe "#==" do
7
+ it "equals when two takes have the same string value" do
8
+ t1 = token.new('bene')
9
+ t2 = token.new('bene')
10
+ t1.should == t2
11
+ end
12
+
13
+ it "doesn't equal when the strings are different" do
14
+ t1 = token.new('bene')
15
+ t2 = token.new('male')
16
+ t1.should_not == t2
17
+ end
18
+
19
+ it "is case insensitive" do
20
+ t1 = token.new('bene')
21
+ t2 = token.new('Bene')
22
+ t1.should == t2
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,66 @@
1
+ require 'spec_helper'
2
+
3
+ describe LLT::Tokenizer::Greek do
4
+ let(:tokenizer) { LLT::Tokenizer.new }
5
+ let(:greek_txt) { "καὶ διὰ τῆς περὶ τὴν ἀρχαιολογίαν συγγραφῆς."}
6
+ let(:krasis) { "κἄπειτα." }
7
+ let(:double_krasis) { "κἄπειτα τῆς περὶ τὴν ἀρχαιολογίαν κἄπειτα." }
8
+ let(:diphtong) { "τοὔνομα." }
9
+
10
+ context "with greek tokens" do
11
+ describe "#tokenize" do
12
+ it "tokenizes a string" do
13
+ res = tokenizer.tokenize(greek_txt)
14
+ res.should == %w(καὶ διὰ τῆς περὶ τὴν ἀρχαιολογίαν συγγραφῆς .)
15
+ res.should have(8).items
16
+ end
17
+
18
+ describe "with a string that contains an apostrophe" do
19
+ it "returns one token to which the apostrophe is attached" do
20
+ txt = "εὖ δ᾽ ἴστε."
21
+ res = tokenizer.tokenize(txt)
22
+ res.should == %w(εὖ δ᾽ ἴστε .)
23
+ res.should have(4).items
24
+ end
25
+
26
+ it "splits two tokens combined by an apostrophe" do
27
+ txt = "εὖ δ᾽ἴστε."
28
+ res = tokenizer.tokenize(txt)
29
+ res.should == %w(εὖ δ᾽ ἴστε .)
30
+ res.should have(4).items
31
+ end
32
+ end
33
+ end
34
+
35
+ describe "handles krasis" do
36
+ it "splits a krasis into two words" do
37
+ res = tokenizer.tokenize(krasis)
38
+ res.should have(3).items
39
+ res.should == %w( κ- ἄπειτα . )
40
+ end
41
+
42
+ it "handles a dipthong krasis" do
43
+ res = tokenizer.tokenize(diphtong)
44
+ res.should have(3).items
45
+ end
46
+
47
+ it "splits two kraseis in a sentence" do
48
+ res = tokenizer.tokenize(double_krasis)
49
+ res.should have(9).items
50
+ res[2].should == "τῆς"
51
+ res[8].should == "."
52
+ end
53
+
54
+ context "with options" do
55
+ context "with disabled splitting" do
56
+ it "doesn't split krasis" do
57
+ txt = 'κἄπειτα.'
58
+ opts = { splitting: false }
59
+ tokens = tokenizer.tokenize(txt, opts).map(&:to_s)
60
+ tokens.should == %w{ κἄπειτα . }
61
+ end
62
+ end
63
+ end
64
+ end
65
+ end
66
+ end
@@ -158,7 +158,9 @@ describe LLT::Tokenizer do
158
158
  "ad eamque" => "-que ad eam",
159
159
  "ob easque" => "-que ob eas",
160
160
  "neque" => "-que ne",
161
+ "Neque" => "-que Ne",
161
162
  "nec" => "-c ne",
163
+ "Nec" => "-c Ne",
162
164
  "Atque" => "Atque",
163
165
  "atque" => "atque",
164
166
  "cuiusque" => "cuiusque",
@@ -208,6 +210,7 @@ describe LLT::Tokenizer do
208
210
  "fine" => "fine",
209
211
  "iuvene" => "iuvene",
210
212
  "sanguine" => "sanguine",
213
+ "igne" => "igne",
211
214
 
212
215
  # frequent patterns in third declension adjective
213
216
  "commune" => "commune",
@@ -282,7 +285,7 @@ describe LLT::Tokenizer do
282
285
  "Word" => %w{ ita Marcus quoque -que po' },
283
286
  "Filler" => %w{ M. Sex. App. Ap. Tib. Ti. C. a. d. Kal. Ian. }, #I XI MMC }
284
287
  "XmlTag" => %w{ <grc> </grc> },
285
- "Punctuation" => %w{ , . ! ? † ( ) [ ] ... -- ” " ' & < > &amp; &lt; &gt; &apos; &quot; }
288
+ "Punctuation" => %w{ , . ! ? † ( ) [ ] ... -- ” " ' ᾽ · & < > &amp; &lt; &gt; &apos; &quot; }
286
289
  }
287
290
 
288
291
  examples.each do |klass, elements|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llt-tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - LFDM
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-11 00:00:00.000000000 Z
11
+ date: 2014-08-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -42,16 +42,16 @@ dependencies:
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ">="
45
+ - - '='
46
46
  - !ruby/object:Gem::Version
47
- version: '0'
47
+ version: '2.14'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ">="
52
+ - - '='
53
53
  - !ruby/object:Gem::Version
54
- version: '0'
54
+ version: '2.14'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: simplecov
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -158,11 +158,15 @@ files:
158
158
  - lib/llt/token/xml_tag.rb
159
159
  - lib/llt/tokenizer.rb
160
160
  - lib/llt/tokenizer/api.rb
161
+ - lib/llt/tokenizer/greek.rb
161
162
  - lib/llt/tokenizer/version.rb
163
+ - lib/llt/tokenizer/version_info.rb
162
164
  - lib/llt/tokenizer/worker.rb
163
165
  - llt-tokenizer.gemspec
164
166
  - spec/lib/llt/token/punctuation_spec.rb
167
+ - spec/lib/llt/token_spec.rb
165
168
  - spec/lib/llt/tokenizer/api_spec.rb
169
+ - spec/lib/llt/tokenizer/greek_spec.rb
166
170
  - spec/lib/llt/tokenizer_spec.rb
167
171
  - spec/spec_helper.rb
168
172
  - spec/support/matchers/tokenizer.rb
@@ -192,7 +196,9 @@ specification_version: 4
192
196
  summary: Breaks latin sentences into tokens
193
197
  test_files:
194
198
  - spec/lib/llt/token/punctuation_spec.rb
199
+ - spec/lib/llt/token_spec.rb
195
200
  - spec/lib/llt/tokenizer/api_spec.rb
201
+ - spec/lib/llt/tokenizer/greek_spec.rb
196
202
  - spec/lib/llt/tokenizer_spec.rb
197
203
  - spec/spec_helper.rb
198
204
  - spec/support/matchers/tokenizer.rb