llt-tokenizer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c59bde34626f221dcf1880325b147dc2673c055f
4
+ data.tar.gz: f78d95a200b8dac652e0cc9a77363c57514c230d
5
+ SHA512:
6
+ metadata.gz: e1240191c6edec8d7a942504dccad5eb1aa539644ea3c852f050f6b9cea2b91fb76c6c2a310fa076c17aafdd9a8d6ed5ed14a8e355b8bb8be88faf8ab564f0b8
7
+ data.tar.gz: 65469d5164e9960c21608bafa694e907ad5ae25d3c281f2fad22b94ebbd6db80cb3789d2a66e4748d7c71021c8dd79194f456a848690bed5363afda60af4f03f
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ language: ruby
2
+ before_script:
3
+ - 'export JRUBY_OPTS=--2.0'
4
+ rvm:
5
+ - 2.1.0
6
+ - 2.0.0
7
+ - jruby-20mode
data/Gemfile ADDED
@@ -0,0 +1,27 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in llt-tokenizer.gemspec
4
+ gemspec
5
+
6
+ gem 'coveralls', require: false
7
+
8
+ gem 'llt-core', git: 'git@github.com:latin-language-toolkit/llt-core.git'
9
+ gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
10
+ gem 'llt-constants', git: 'git@github.com:latin-language-toolkit/llt-constants.git'
11
+ gem 'llt-db_handler', git: 'git@github.com:latin-language-toolkit/llt-db_handler.git'
12
+ gem 'llt-helpers', git: 'git@github.com:latin-language-toolkit/llt-helpers.git'
13
+
14
+ # Dependencies of db_handler
15
+ gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
16
+ gem 'llt-form_builder', git: 'git@github.com:latin-language-toolkit/llt-form_builder.git'
17
+
18
+ platform :ruby do
19
+ gem 'pg'
20
+ end
21
+
22
+ platform :jruby do
23
+ gem 'activerecord-jdbcpostgresql-adapter'
24
+ gem 'jruby-httpclient'
25
+ end
26
+
27
+ gem 'pry'
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 LFDM
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,97 @@
1
+ # LLT::Tokenizer
2
+
3
+ Flexible service to tokenize Latin texts.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'llt-tokenizer'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install llt-tokenizer
18
+
19
+ ## Usage
20
+
21
+ The LLT's Tokenizer makes use of stem dictionaries. Refer to [these instructions](http://github.com/latin-language-toolkit/llt-db_handler "llt-db_handler") on how to set one up.
22
+
23
+ ```ruby
24
+ require 'llt/tokenizer'
25
+
26
+ t = LLT::Tokenizer.new
27
+ tokens = t.tokenize('Arma virumque cano.')
28
+ tokens.map(&:to_s)
29
+ # => ["Arma", "-que", "virum", "cano", "."]
30
+ ```
31
+
32
+ The Tokenizer takes several options upon creation or a call to #tokenize:
33
+
34
+ ```ruby
35
+ # shifting determines if enclitics shall be moved to
36
+ # their functional position
37
+ t = LLT::Tokenizer.new(shifting: true)
38
+ tokens = t.tokenize('In eoque arma cano.')
39
+ tokens.map(&:to_s)
40
+ # => ["-que", "In", "eo", "arma", "cano", "."]
41
+
42
+ # all options can be passed directly to #tokenize to override
43
+ # the default options
44
+ tokens = t.tokenize('In eoque arma cano.', shifting: false)
45
+ tokens.map(&:to_s)
46
+ # => ["In", "eo", "-que", "arma", "cano", "."]
47
+
48
+ # enclitics_marker takes a string, which marks up split enclitics
49
+ t = LLT::Tokenizer.new(enclitics_marker: '--', shifting: false)
50
+ tokens = t.tokenize('Arma virumque cano.')
51
+ tokens.map(&:to_s)
52
+ # => ["Arma", "virum", "--que", "cano", "."]
53
+
54
+ # indexing determines if each token shall receive a consecutive id
55
+ tokens = t.tokenize('Arma virumque cano.', indexing: true)
56
+ tokens.first.id # => 1
57
+ tokens = t.tokenize('Arma virumque cano.', indexing: false)
58
+ tokens.first.id # => nil
59
+
60
+ # merging enables token merging of lemmata that often appear with
61
+ # orthographical inconsistencies
62
+ tokens = t.tokenize('Quam diu cano?', merging: true)
63
+ tokens.map(&:to_s)
64
+ # => ["Quamdiu", "cano", "?"]
65
+ ```
66
+
67
+ The returned items are instances of LLT::Token, which can be marked up
68
+ in a variety of forms:
69
+
70
+ ```ruby
71
+ t = LLT::Tokenizer.new(shifting: false, indexing: true)
72
+ tokens = t.tokenize('Arma virumque cano.')
73
+ tokens.map(&:to_xml)
74
+ # => ["<w>arma<_w>", "<w>virum<_w>", "<w>-que<_w>", "<w>cano<_w>", "<pc>.<_pc>"]
75
+ ```
76
+
77
+ Standard TEI XML markup is used: w tags for word tokens, pc tags for
78
+ punctuation. The #to_xml method is highly flexible as well, for full
79
+ coverage see _TODO_.
80
+
81
+ ```ruby
82
+ puts tokens.map { |token| token.to_xml(indexing: true) }
83
+ # <w n="1">Arma</w>
84
+ # <w n="2">virum</w>
85
+ # <w n="3">-que</w>
86
+ # <w n="4">cano</w>
87
+ # <pc n="5">.</pc>
88
+ ```
89
+
90
+
91
+ ## Contributing
92
+
93
+ 1. Fork it
94
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
95
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
96
+ 4. Push to the branch (`git push origin my-new-feature`)
97
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,31 @@
1
+ module LLT
2
+ class Token
3
+ class Filler < Token
4
+ xml_tag 'w'
5
+
6
+ def add(type)
7
+ case type
8
+ when :name then add_name_form
9
+ end
10
+ end
11
+
12
+ #def add_name_form
13
+ # @possible_forms << PersonaFiller.new(@word)
14
+ #end
15
+
16
+ # cannot hold anything atm, is therefore never really empty
17
+ def empty?
18
+ false
19
+ end
20
+ alias :no_forms? :empty?
21
+
22
+ def set_functions
23
+ [:filler]
24
+ end
25
+
26
+ def inspect
27
+ "#{"Filler token".blue}: #{@string}"
28
+ end
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,36 @@
1
+ module LLT
2
+ class Token
3
+ class Punctuation < Token
4
+ xml_tag 'pc'
5
+
6
+ attr_accessor :opening, :closing, :other
7
+
8
+ def initialize(string, id = nil)
9
+ super
10
+ # this is part of an old interface that is mostly unused
11
+ # some parts remain - find and delete em
12
+ @opening = false
13
+ @closing = false
14
+ @other = false
15
+ end
16
+
17
+ # cannot hold anything atm, is therefore never really empty
18
+ def empty?
19
+ false
20
+ end
21
+ alias :no_forms? :empty?
22
+
23
+ def set_functions
24
+ [:punctuation]
25
+ end
26
+
27
+ def punctuation
28
+ @string
29
+ end
30
+
31
+ def inspect
32
+ "#{"Punctuation token:".yellow} #{@string}"
33
+ end
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,53 @@
1
+ module LLT
2
+ class Token
3
+ class Word < Token
4
+ xml_tag 'w'
5
+
6
+ def word
7
+ @string
8
+ end
9
+
10
+ def no_forms?
11
+ @container.empty?
12
+ end
13
+
14
+ def set_functions
15
+ [:word]
16
+ end
17
+
18
+ def use(i = nil)
19
+ if i
20
+ return @container[i - 1]
21
+ elsif block_given?
22
+ @container.find { |f| yield(f) }
23
+ end
24
+ end
25
+
26
+ def inspect
27
+ "#{"Word token".green}: #{@string}\n" +
28
+ "\tForms: #{forms_to_s}\n"
29
+ end
30
+
31
+ def forms_to_s
32
+ # was each_with_index_and_object, which is currently not available
33
+ @container.each_with_index.each_with_object("") do |(f, i), str|
34
+ str << enumeration(i) << stripped_form(f)
35
+ str << delimiter unless f == @container.last
36
+ str
37
+ end
38
+ end
39
+
40
+ def stripped_form(form)
41
+ form.to_s.sub(@string, "").strip
42
+ end
43
+
44
+ def enumeration(i)
45
+ "#{i}: ".light_yellow
46
+ end
47
+
48
+ def delimiter
49
+ " | ".cyan
50
+ end
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,24 @@
1
+ module LLT
2
+ class Token
3
+ class XmlTag < Token
4
+ def set_functions
5
+ [:xml_tag]
6
+ end
7
+
8
+ # overrides #to_xml from Containable - the tag stays as it is
9
+ def to_xml(*args)
10
+ to_s
11
+ end
12
+
13
+ def inspect
14
+ "#{'XML tag'.blue} #{tag_status}: #{to_s}"
15
+ end
16
+
17
+ private
18
+
19
+ def tag_status
20
+ to_s.match(/\//) ? 'open' : 'close'
21
+ end
22
+ end
23
+ end
24
+ end
data/lib/llt/token.rb ADDED
@@ -0,0 +1,51 @@
1
+ require 'llt/core/containable'
2
+ require 'llt/helpers/functions'
3
+
4
+ module LLT
5
+ class Token
6
+ include Core::Containable
7
+ include Helpers::Functions
8
+ #include Phonology
9
+
10
+ require 'llt/token/word'
11
+ require 'llt/token/punctuation'
12
+ require 'llt/token/filler'
13
+ require 'llt/token/xml_tag'
14
+
15
+ attr_reader :functions, :special_roles
16
+
17
+ container_alias :forms
18
+
19
+ def initialize(string, id = nil)
20
+ super
21
+ @functions = set_functions
22
+ end
23
+
24
+ def special_roles
25
+ @special_roles || []
26
+ end
27
+
28
+ def has_special_role?(role)
29
+ special_roles.include?(role)
30
+ end
31
+
32
+ def set_special_role(*roles)
33
+ @special_roles ||= []
34
+ @special_roles += roles
35
+ end
36
+
37
+ # deprecated
38
+ def add_form(form)
39
+ @forms << form
40
+ end
41
+
42
+ # deprecated
43
+ def add_forms(forms)
44
+ @forms += forms
45
+ end
46
+
47
+ def use(*args)
48
+ # hook method, overwritten by Word
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,20 @@
1
+ require 'sinatra/base'
2
+ require 'sinatra/respond_with'
3
+ require 'llt/tokenizer'
4
+ require 'llt/core/api'
5
+
6
+ class Api < Sinatra::Base
7
+ register Sinatra::RespondWith
8
+ helpers LLT::Core::Api::Helpers
9
+
10
+ get '/tokenize' do
11
+ typecast_params!(params)
12
+ text = extract_text(params)
13
+ tokenizer = LLT::Tokenizer.new(params)
14
+ tokens = tokenizer.tokenize(text)
15
+
16
+ respond_to do |f|
17
+ f.xml { to_xml(tokens, params) }
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,5 @@
1
+ module LLT
2
+ class Tokenizer
3
+ VERSION = "0.0.1"
4
+ end
5
+ end
@@ -0,0 +1,106 @@
1
+ require 'forwardable'
2
+
3
+ module LLT
4
+ class Tokenizer
5
+ class Worker
6
+ extend Forwardable
7
+ include Enumerable
8
+ include Helpers::Metrical
9
+
10
+ def_delegators :@bare_text, :each, :[], :[]=, :insert, :delete_at,
11
+ :each_overlapping_pair, :map!
12
+
13
+ # TODO 28.11.13 11:45 by LFDM
14
+ # Edge cases?
15
+ # Merge words?
16
+
17
+ def initialize(metric_text, marker)
18
+ @metric_text = metric_text
19
+ @bare_text = metric_text.map { |token| wo_meter(token) }
20
+ @marker = marker
21
+ @marked_enclitics = ENCLITICS.map { |e| "#{@marker}#{e}"}
22
+ end
23
+
24
+ def to_a
25
+ align_metrical_text
26
+ @metric_text
27
+ end
28
+
29
+ private
30
+
31
+ # One ugly method, but we don't want to slow it down even more
32
+ def align_metrical_text
33
+ m = ArrayScanner.new(@metric_text)
34
+ b = ArrayScanner.new(@bare_text)
35
+ loop do
36
+ # metric element
37
+ x = m.scan
38
+ # bare element
39
+ y = b.scan
40
+ no_meter = wo_meter(x)
41
+
42
+ # we don't have to do anything if the dequantified metric element
43
+ # was the same as the bare element - the metric_text was right
44
+ # at this position
45
+ unless no_meter == y
46
+
47
+ # If the bare element was a marked enclitic, it must have been
48
+ # shifted. We're looking for the next metric token, that has it
49
+ # attached and try to find the string index where it starts to
50
+ # slice it of.
51
+ # Usually the metric element just scanned (y) will have it, if we
52
+ # don't find it, a double shift has occured and it should sit right
53
+ # at the current element of the metric ArrayScanner (m).
54
+ # The enclitic (sliced of x) has to be inserted one position before.
55
+ if @marked_enclitics.include?(y)
56
+ clean_encl_re = /#{y.dup.delete(@marker)}$/
57
+ unless index = no_meter =~ clean_encl_re
58
+ x = m.current
59
+ index = wo_meter(x) =~ clean_encl_re
60
+ end
61
+ insert!(slice_encl!(x, index), m.pos - 1)
62
+
63
+ # If the dequantified metric element has an enclitic attached, the
64
+ # option shifting: false must have been given. The enclitic will
65
+ # follow right after in the @bare_text, we can therefore slice and
66
+ # insert right in place (the next # scan round will reveal that
67
+ # enclitic in metric_text == enclitic in bare_text
68
+ elsif encl = ENCLITICS.find { |e| no_meter.end_with?(e) }
69
+ index = no_meter =~ /#{encl}$/
70
+ insert!(slice_encl!(x, index), m.pos)
71
+
72
+ # If the bare element has a dot attached, it must have been an
73
+ # abbreviation.
74
+ # The . will appear right afterwards in the metric text. We can
75
+ # delete it and append it to the last scanned metric element (x)
76
+ #
77
+ # We need to do the same if merge words were present.
78
+ # The last metric element was quam, the bare element is quamdiu.
79
+ # We append if the last metric element + the next metric element
80
+ # is the same as the bare element.
81
+ elsif y.end_with?('.') || merged_words_present?(no_meter, y, m)
82
+ append_from_deleted_index!(x, m.pos)
83
+ end
84
+ end
85
+ break if b.eoa?
86
+ end
87
+ end
88
+
89
+ def insert!(enclitic, position)
90
+ @metric_text.insert(position, "#{@marker}#{enclitic}")
91
+ end
92
+
93
+ def slice_encl!(token, index)
94
+ token.slice!(index..-1)
95
+ end
96
+
97
+ def append_from_deleted_index!(token, index)
98
+ token << @metric_text.delete_at(index)
99
+ end
100
+
101
+ def merged_words_present?(last_metric, last_bare, metric_arr_scanner)
102
+ (last_metric + wo_meter(metric_arr_scanner.peek)) == last_bare
103
+ end
104
+ end
105
+ end
106
+ end