llt-tokenizer 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 17d75e05ecdf64cd93e8de239e57652b4a6375ba
4
- data.tar.gz: b21ab0737044b952a9642a68a9aae48a4d615d4e
3
+ metadata.gz: 9a9abfc5e79b148f497749053c8ccfa7ac9653af
4
+ data.tar.gz: 1c9fe20eb2824eccc1840602beae6552415eb5d2
5
5
  SHA512:
6
- metadata.gz: afadf19b6eb1a7f45dca79631efa57ceb7359d286523d2979658208decffea5d094fdcd4fcc9b3e02f788390bd3e2eeb28c415f3ad7ea97efcdbdf28602c7d2d
7
- data.tar.gz: f752b02680802aafb6305490b139dc0b9ca2e3e35e9b48ca2610f2039e57959571ac432e2b7fa3b4b9e1f05abdc54c9ef8e99f42a2927a2ee25294ecad908fd9
6
+ metadata.gz: 3cd367d754d75f895240c709aed9697140c8359490bc634e56f118b77cc015c2a08c80d7fa4fa74448084844beec4749a7b01b1789c0805a3a5a8fa8d465d5e9
7
+ data.tar.gz: 21c50a75955cab805fb81bc1435963e047171936c015981121de1405378fb4af9c21a69153c0c043d3a504986e1022437690cedb60b88e0b8246ca6fce20565b
data/.travis.yml CHANGED
@@ -4,4 +4,4 @@ before_script:
4
4
  rvm:
5
5
  - 2.1.0
6
6
  - 2.0.0
7
- - jruby-20mode
7
+ - jruby-1.7.8
data/Gemfile CHANGED
@@ -5,15 +5,15 @@ gemspec
5
5
 
6
6
  gem 'coveralls', require: false
7
7
 
8
- gem 'llt-core', git: 'git@github.com:latin-language-toolkit/llt-core.git'
9
- gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
10
- gem 'llt-constants', git: 'git@github.com:latin-language-toolkit/llt-constants.git'
11
- gem 'llt-db_handler', git: 'git@github.com:latin-language-toolkit/llt-db_handler.git'
12
- gem 'llt-db_handler-stub', git: 'git@github.com:latin-language-toolkit/llt-db_handler-stub.git'
13
- gem 'llt-helpers', git: 'git@github.com:latin-language-toolkit/llt-helpers.git'
8
+ gem 'llt-core', git: 'git://github.com/latin-language-toolkit/llt-core.git'
9
+ gem 'llt-core_extensions', git: 'git://github.com/latin-language-toolkit/llt-core_extensions.git'
10
+ gem 'llt-constants', git: 'git://github.com/latin-language-toolkit/llt-constants.git'
11
+ gem 'llt-db_handler', git: 'git://github.com/latin-language-toolkit/llt-db_handler.git'
12
+ gem 'llt-db_handler-stub', git: 'git://github.com/latin-language-toolkit/llt-db_handler-stub.git'
13
+ gem 'llt-helpers', git: 'git://github.com/latin-language-toolkit/llt-helpers.git'
14
14
 
15
15
  # Dependencies of db_handler
16
- gem 'llt-form_builder', git: 'git@github.com:latin-language-toolkit/llt-form_builder.git'
16
+ gem 'llt-form_builder', git: 'git://github.com/latin-language-toolkit/llt-form_builder.git'
17
17
 
18
18
  platform :ruby do
19
19
  gem 'pg'
@@ -23,5 +23,3 @@ platform :jruby do
23
23
  gem 'activerecord-jdbcpostgresql-adapter'
24
24
  gem 'jruby-httpclient'
25
25
  end
26
-
27
- gem 'pry'
data/README.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # LLT::Tokenizer
2
2
 
3
+ [![Version](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/badge_fury.png)](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/badge_fury)
4
+ [![Dependencies](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/gemnasium.png)](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/gemnasium)
5
+ [![Build Status](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/travis.png)](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/travis)
6
+ [![Coverage](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/coveralls.png)](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/coveralls)
7
+ [![Code Climate](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/code_climate.png)](http://allthebadges.io/latin-language-toolkit/llt-tokenizer/code_climate)
8
+
3
9
  Flexible service to tokenize Latin texts.
4
10
 
5
11
  ## Installation
data/lib/llt/token.rb CHANGED
@@ -34,6 +34,10 @@ module LLT
34
34
  @special_roles += roles
35
35
  end
36
36
 
37
+ def ==(other)
38
+ to_s.downcase == other.to_s.downcase
39
+ end
40
+
37
41
  # deprecated
38
42
  def add_form(form)
39
43
  @forms << form
@@ -47,5 +51,9 @@ module LLT
47
51
  def use(*args)
48
52
  # hook method, overwritten by Word
49
53
  end
54
+
55
+ def set_functions(*args)
56
+ # hook method
57
+ end
50
58
  end
51
59
  end
data/lib/llt/tokenizer.rb CHANGED
@@ -4,15 +4,19 @@ require 'llt/constants/abbreviations'
4
4
  require 'llt/core_extensions/array'
5
5
  require 'llt/db_handler/prometheus'
6
6
  require 'llt/helpers/metrical'
7
+ require 'llt/tokenizer/version'
8
+ require 'llt/tokenizer/version_info'
7
9
 
8
10
  module LLT
9
11
  class Tokenizer
10
12
  require 'llt/token'
11
13
  require 'llt/tokenizer/worker'
14
+ require 'llt/tokenizer/greek'
12
15
 
13
16
  include Core::Serviceable
14
17
  include Constants::Abbreviations
15
18
  include Helpers::Metrical
19
+ include Greek
16
20
 
17
21
  uses_db { DbHandler::Prometheus.new }
18
22
 
@@ -26,6 +30,8 @@ module LLT
26
30
  indexing: true,
27
31
  splitting: true,
28
32
  xml: false,
33
+ #for Greek
34
+ krasis_marker: '-'
29
35
  }
30
36
  end
31
37
 
@@ -36,6 +42,8 @@ module LLT
36
42
  setup(text, options)
37
43
 
38
44
  find_abbreviations_and_join_strings
45
+ #for Greek
46
+ split_krasis if @splitting
39
47
  split_enklitika_and_change_their_position if @splitting
40
48
  merge_what_needs_merging if @merging # quam diu => quamdiu
41
49
  tokens = create_tokens
@@ -53,11 +61,13 @@ module LLT
53
61
  @splitting = parse_option(:splitting, options)
54
62
  @indexing = parse_option(:indexing, options)
55
63
  @xml = parse_option(:xml, options)
64
+ #for Greek
65
+ @krasis_marker = parse_option(:krasis_marker, options)
56
66
  @worker = setup_worker(worker)
57
67
  @shift_range = shift_range(@shifting)
58
68
  end
59
69
 
60
- PUNCTUATION = /&(?:amp|quot|apos|lt|gt);|([\.\?,!;\-:"'”&\(\)\[\]†<>])\1*/
70
+ PUNCTUATION = /&(?:amp|quot|apos|lt|gt);|([\.\?,!;\-:"'”&\(\)\[\]†<>᾽·])\1*/
61
71
  XML_TAG = /<\/?.+?>/
62
72
 
63
73
  # This is here for two reasons:
@@ -129,7 +139,7 @@ module LLT
129
139
  arr = []
130
140
  @worker.each_with_index do |e, i|
131
141
  n = @worker[i + 1]
132
- if (n == '.' && e =~ ABBREVIATIONS) || (n == "'" && e =~ APOSTROPHE_WORDS)
142
+ if (n == '.' && e =~ ABBREVIATIONS) || (n == "'" && e =~ APOSTROPHE_WORDS) || greek_apostrophe(n,e)
133
143
  @worker[i + 1] = n.prepend(e)
134
144
  arr << (i - arr.size)
135
145
  end
@@ -141,7 +151,7 @@ module LLT
141
151
  ######################
142
152
 
143
153
  WORDS_ENDING_WITH_QUE = /^((un.{1,3})?[qc]u[aei].*que|qu[ao]que|itaque|atque|ut[er].*que|.*cumque|pler(.{1,2}|[oa]rum)que|denique|undique|usque)$/i # neque taken out!
144
- WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene|iuvene)$/i
154
+ WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene|iuvene|siccine)$/i # generalize these words and start to look for them in the db, especially for adverbs
145
155
  WORDS_ENDING_WITH_VE = /^(sive|neve)$/i
146
156
 
147
157
  # laetusque to -que laetus
@@ -195,7 +205,7 @@ module LLT
195
205
  def split_nec
196
206
  indices = []
197
207
  @worker.each_with_index do |token, i|
198
- if token == 'nec'
208
+ if token =~ /^nec$/i
199
209
  token.slice!(-1)
200
210
  indices << (i + indices.size + @shift_range)
201
211
  end
@@ -247,7 +257,7 @@ module LLT
247
257
  entries = []
248
258
  entries += lookup(orig_el, :noun, :nom) if orig_el =~ /io$/ # actio-ne ratio-ne
249
259
  entries += lookup(orig_el + "n", :persona, :stem) if orig_el =~ /o$/ # Plato-ne Cicero-ne Solo-ne
250
- entries += lookup(orig_el + "n", :noun, :stem, [3, 33]) if orig_el =~ /[ei]$/ # fortitudi-ne ratio-ne libidi-ne homi-ne fi-ne agmi-ne iuve-ne
260
+ entries += lookup(orig_el + "n", :noun, :stem, [3, 33]) # fortitudi-ne ratio-ne libidi-ne homi-ne fi-ne agmi-ne iuve-ne ig-ne
251
261
  entries += lookup(orig_el + "n", :noun, :stem, 2) # domi-ne
252
262
  entries += lookup(orig_el + "n", :adjective, :stem, [1,3]) # communis commune, or bonus
253
263
 
@@ -319,7 +329,6 @@ module LLT
319
329
  end
320
330
  end
321
331
 
322
-
323
332
  ######################
324
333
 
325
334
  MERGE_WORDS = [ %w{ quam diu }, ['non', /null.{1,4}$/] ]
@@ -5,6 +5,7 @@ require 'llt/core/api'
5
5
 
6
6
  class Api < Sinatra::Base
7
7
  register Sinatra::RespondWith
8
+ register LLT::Core::Api::VersionRoutes
8
9
  helpers LLT::Core::Api::Helpers
9
10
 
10
11
  get '/tokenize' do
@@ -17,4 +18,6 @@ class Api < Sinatra::Base
17
18
  f.xml { to_xml(tokens, params) }
18
19
  end
19
20
  end
21
+
22
+ add_version_route_for('/tokenize', dependencies: %i{ Core Tokenizer })
20
23
  end
@@ -0,0 +1,69 @@
1
+ module LLT
2
+ class Tokenizer
3
+ module Greek
4
+ PLAIN_VOWELS = %w(α ε ι η ο υ ω)
5
+ VOWELS_WITH_ACUTE = %w(ά έ ή ί ó ύ ώ)
6
+ VOWELS_WITH_GRAVE = %w(ὰ ὲ ὴ ì ò ὺ ὼ)
7
+ VOWELS_WITH_CIRCUMFLEX = %w(ᾶ ῆ ῖ ῦ ῶ)
8
+ VOWELS_WITH_IOTA = %w(ᾲ ᾳ ᾴ ᾷ ῂ ῃ ῄ ῇ ῲ ῳ ῴ ῷ)
9
+ CONSONANTS = %w(β γ δ ζ θ κ λ μ ν ξ π ρ ῥ ῤ σ ς τ φ χ ψ)
10
+ VOWELS = [PLAIN_VOWELS,
11
+ VOWELS_WITH_ACUTE,
12
+ VOWELS_WITH_GRAVE,
13
+ VOWELS_WITH_CIRCUMFLEX,
14
+ VOWELS_WITH_IOTA
15
+ ].flatten
16
+
17
+ SPIRITUS_LENIS = %w(ἀ ἐ ἠ ἰ ὀ ὐ ὠ)
18
+ SPIRITUS_LENIS_WITH_GRAVE = %w(ἂ ἒ ἲ ἢ ὂ ὒ ὢ)
19
+ SPIRITUS_LENIS_WITH_ACUTE = %w(ἄ ἔ ἴ ἤ ὄ ὔ ὤ)
20
+ SPIRITUS_LENIS_WITH_CIRCUMFLEX = %w(ἆ ἶ ἦ ὖ ὦ )
21
+
22
+ SPIRITUS_ASPER = %w(ἁ ἑ ἡ ἱ ὁ ὑ ὡ)
23
+ SPIRITUS_ASPER_WITH_GRAVE = %w(ἃ ἣ ἓ ἳ ὃ ὓ ὣ)
24
+ SPIRITUS_ASPER_WITH_ACUTE = %w(ἅ ἥ ἕ ἵ ὅ ὕ ὥ)
25
+ SPIRITUS_ASPER_WITH_CIRCUMFLEX = %w(ἇ ἷ ἧ ὗ ὧ)
26
+
27
+ SPIRITUS_WITH_IOTA = %w(ᾀ ᾁ ᾂ ᾃ ᾄ ᾅ ᾆ ᾇ ᾐ ᾑ ᾒ ᾓ ᾔ ᾕ ᾖ ᾗ ᾠ ᾡ ᾢ ᾣ ᾤ ᾥ ᾦ ᾧ)
28
+
29
+ VOWELS_WITH_SPIRITUS = [
30
+ SPIRITUS_LENIS,
31
+ SPIRITUS_LENIS_WITH_ACUTE,
32
+ SPIRITUS_LENIS_WITH_GRAVE,
33
+ SPIRITUS_LENIS_WITH_CIRCUMFLEX,
34
+ SPIRITUS_ASPER,
35
+ SPIRITUS_ASPER_WITH_ACUTE,
36
+ SPIRITUS_ASPER_WITH_GRAVE,
37
+ SPIRITUS_ASPER_WITH_CIRCUMFLEX,
38
+ SPIRITUS_WITH_IOTA
39
+ ].flatten
40
+
41
+ STARTING_VOWELS = Regexp.union(VOWELS_WITH_SPIRITUS)
42
+ CONS = Regexp.union(CONSONANTS)
43
+ ALL = Regexp.union([CONSONANTS, VOWELS].flatten)
44
+
45
+ def split_krasis
46
+ @worker.each_with_index do |token, i|
47
+ if resolved_krasis = contains_krasis(token)
48
+ @worker[i] = resolved_krasis
49
+ @worker.flatten!
50
+ end
51
+ end
52
+ end
53
+
54
+ def krasis(token)
55
+ "#{token}#{@krasis_marker}"
56
+ end
57
+
58
+ def contains_krasis(token)
59
+ if token.match(/^(#{CONS})(#{PLAIN_VOWELS}?#{STARTING_VOWELS})(#{ALL}*)$/)
60
+ [krasis($1), $2+$3]
61
+ end
62
+ end
63
+
64
+ def greek_apostrophe(n, e)
65
+ (n == "᾽" && e =~ CONS)
66
+ end
67
+ end
68
+ end
69
+ end
@@ -1,5 +1,5 @@
1
1
  module LLT
2
2
  class Tokenizer
3
- VERSION = "0.0.5"
3
+ VERSION = "0.0.6"
4
4
  end
5
5
  end
@@ -0,0 +1,7 @@
1
+ module LLT
2
+ class Tokenizer
3
+ class VersionInfo
4
+ include Core::Versioner
5
+ end
6
+ end
7
+ end
@@ -20,7 +20,7 @@ Gem::Specification.new do |spec|
20
20
 
21
21
  spec.add_development_dependency "bundler", "~> 1.3"
22
22
  spec.add_development_dependency "rake"
23
- spec.add_development_dependency "rspec"
23
+ spec.add_development_dependency "rspec", "2.14"
24
24
  spec.add_development_dependency "simplecov", "~> 0.7"
25
25
  spec.add_dependency "array_scanner"
26
26
  spec.add_dependency "llt-core"
@@ -0,0 +1,25 @@
1
+ require 'spec_helper'
2
+
3
+ describe LLT::Token do
4
+ let(:token) { LLT::Token }
5
+
6
+ describe "#==" do
7
+ it "equals when two tokens have the same string value" do
8
+ t1 = token.new('bene')
9
+ t2 = token.new('bene')
10
+ t1.should == t2
11
+ end
12
+
13
+ it "doesn't equal when the strings are different" do
14
+ t1 = token.new('bene')
15
+ t2 = token.new('male')
16
+ t1.should_not == t2
17
+ end
18
+
19
+ it "is case insensitive" do
20
+ t1 = token.new('bene')
21
+ t2 = token.new('Bene')
22
+ t1.should == t2
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,66 @@
1
+ require 'spec_helper'
2
+
3
+ describe LLT::Tokenizer::Greek do
4
+ let(:tokenizer) { LLT::Tokenizer.new }
5
+ let(:greek_txt) { "καὶ διὰ τῆς περὶ τὴν ἀρχαιολογίαν συγγραφῆς."}
6
+ let(:krasis) { "κἄπειτα." }
7
+ let(:double_krasis) { "κἄπειτα τῆς περὶ τὴν ἀρχαιολογίαν κἄπειτα." }
8
+ let(:diphtong) { "τοὔνομα." }
9
+
10
+ context "with greek tokens" do
11
+ describe "#tokenize" do
12
+ it "tokenizes a string" do
13
+ res = tokenizer.tokenize(greek_txt)
14
+ res.should == %w(καὶ διὰ τῆς περὶ τὴν ἀρχαιολογίαν συγγραφῆς .)
15
+ res.should have(8).items
16
+ end
17
+
18
+ describe "with a string that contains an apostrophe" do
19
+ it "returns one token to which the apostrophe is attached" do
20
+ txt = "εὖ δ᾽ ἴστε."
21
+ res = tokenizer.tokenize(txt)
22
+ res.should == %w(εὖ δ᾽ ἴστε .)
23
+ res.should have(4).items
24
+ end
25
+
26
+ it "splits two tokens combined by an apostrophe" do
27
+ txt = "εὖ δ᾽ἴστε."
28
+ res = tokenizer.tokenize(txt)
29
+ res.should == %w(εὖ δ᾽ ἴστε .)
30
+ res.should have(4).items
31
+ end
32
+ end
33
+ end
34
+
35
+ describe "handles krasis" do
36
+ it "splits a krasis into two words" do
37
+ res = tokenizer.tokenize(krasis)
38
+ res.should have(3).items
39
+ res.should == %w( κ- ἄπειτα . )
40
+ end
41
+
42
+ it "handles a diphthong krasis" do
43
+ res = tokenizer.tokenize(diphtong)
44
+ res.should have(3).items
45
+ end
46
+
47
+ it "splits two kraseis in a sentence" do
48
+ res = tokenizer.tokenize(double_krasis)
49
+ res.should have(9).items
50
+ res[2].should == "τῆς"
51
+ res[8].should == "."
52
+ end
53
+
54
+ context "with options" do
55
+ context "with disabled splitting" do
56
+ it "doesn't split krasis" do
57
+ txt = 'κἄπειτα.'
58
+ opts = { splitting: false }
59
+ tokens = tokenizer.tokenize(txt, opts).map(&:to_s)
60
+ tokens.should == %w{ κἄπειτα . }
61
+ end
62
+ end
63
+ end
64
+ end
65
+ end
66
+ end
@@ -158,7 +158,9 @@ describe LLT::Tokenizer do
158
158
  "ad eamque" => "-que ad eam",
159
159
  "ob easque" => "-que ob eas",
160
160
  "neque" => "-que ne",
161
+ "Neque" => "-que Ne",
161
162
  "nec" => "-c ne",
163
+ "Nec" => "-c Ne",
162
164
  "Atque" => "Atque",
163
165
  "atque" => "atque",
164
166
  "cuiusque" => "cuiusque",
@@ -208,6 +210,7 @@ describe LLT::Tokenizer do
208
210
  "fine" => "fine",
209
211
  "iuvene" => "iuvene",
210
212
  "sanguine" => "sanguine",
213
+ "igne" => "igne",
211
214
 
212
215
  # frequent patterns in third declension adjective
213
216
  "commune" => "commune",
@@ -282,7 +285,7 @@ describe LLT::Tokenizer do
282
285
  "Word" => %w{ ita Marcus quoque -que po' },
283
286
  "Filler" => %w{ M. Sex. App. Ap. Tib. Ti. C. a. d. Kal. Ian. }, #I XI MMC }
284
287
  "XmlTag" => %w{ <grc> </grc> },
285
- "Punctuation" => %w{ , . ! ? † ( ) [ ] ... -- ” " ' & < > &amp; &lt; &gt; &apos; &quot; }
288
+ "Punctuation" => %w{ , . ! ? † ( ) [ ] ... -- ” " ' ᾽ · & < > &amp; &lt; &gt; &apos; &quot; }
286
289
  }
287
290
 
288
291
  examples.each do |klass, elements|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llt-tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - LFDM
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-11 00:00:00.000000000 Z
11
+ date: 2014-08-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -42,16 +42,16 @@ dependencies:
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ">="
45
+ - - '='
46
46
  - !ruby/object:Gem::Version
47
- version: '0'
47
+ version: '2.14'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ">="
52
+ - - '='
53
53
  - !ruby/object:Gem::Version
54
- version: '0'
54
+ version: '2.14'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: simplecov
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -158,11 +158,15 @@ files:
158
158
  - lib/llt/token/xml_tag.rb
159
159
  - lib/llt/tokenizer.rb
160
160
  - lib/llt/tokenizer/api.rb
161
+ - lib/llt/tokenizer/greek.rb
161
162
  - lib/llt/tokenizer/version.rb
163
+ - lib/llt/tokenizer/version_info.rb
162
164
  - lib/llt/tokenizer/worker.rb
163
165
  - llt-tokenizer.gemspec
164
166
  - spec/lib/llt/token/punctuation_spec.rb
167
+ - spec/lib/llt/token_spec.rb
165
168
  - spec/lib/llt/tokenizer/api_spec.rb
169
+ - spec/lib/llt/tokenizer/greek_spec.rb
166
170
  - spec/lib/llt/tokenizer_spec.rb
167
171
  - spec/spec_helper.rb
168
172
  - spec/support/matchers/tokenizer.rb
@@ -192,7 +196,9 @@ specification_version: 4
192
196
  summary: Breaks latin sentences into tokens
193
197
  test_files:
194
198
  - spec/lib/llt/token/punctuation_spec.rb
199
+ - spec/lib/llt/token_spec.rb
195
200
  - spec/lib/llt/tokenizer/api_spec.rb
201
+ - spec/lib/llt/tokenizer/greek_spec.rb
196
202
  - spec/lib/llt/tokenizer_spec.rb
197
203
  - spec/spec_helper.rb
198
204
  - spec/support/matchers/tokenizer.rb