llt-tokenizer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c59bde34626f221dcf1880325b147dc2673c055f
4
+ data.tar.gz: f78d95a200b8dac652e0cc9a77363c57514c230d
5
+ SHA512:
6
+ metadata.gz: e1240191c6edec8d7a942504dccad5eb1aa539644ea3c852f050f6b9cea2b91fb76c6c2a310fa076c17aafdd9a8d6ed5ed14a8e355b8bb8be88faf8ab564f0b8
7
+ data.tar.gz: 65469d5164e9960c21608bafa694e907ad5ae25d3c281f2fad22b94ebbd6db80cb3789d2a66e4748d7c71021c8dd79194f456a848690bed5363afda60af4f03f
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ language: ruby
2
+ before_script:
3
+ - 'export JRUBY_OPTS=--2.0'
4
+ rvm:
5
+ - 2.1.0
6
+ - 2.0.0
7
+ - jruby-20mode
data/Gemfile ADDED
@@ -0,0 +1,27 @@
1
source 'https://rubygems.org'

# Specify your gem's dependencies in llt-tokenizer.gemspec
gemspec

gem 'coveralls', require: false

gem 'llt-core', git: 'git@github.com:latin-language-toolkit/llt-core.git'
gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
gem 'llt-constants', git: 'git@github.com:latin-language-toolkit/llt-constants.git'
gem 'llt-db_handler', git: 'git@github.com:latin-language-toolkit/llt-db_handler.git'
gem 'llt-helpers', git: 'git@github.com:latin-language-toolkit/llt-helpers.git'

# Dependencies of db_handler
# NOTE: llt-core_extensions is already declared above - declaring the
# same gem twice makes Bundler fail with a duplicate-gem error, so only
# form_builder is added here.
gem 'llt-form_builder', git: 'git@github.com:latin-language-toolkit/llt-form_builder.git'

# Database drivers differ per platform: native pg on MRI,
# the JDBC adapter (plus an HTTP client) on JRuby.
platform :ruby do
  gem 'pg'
end

platform :jruby do
  gem 'activerecord-jdbcpostgresql-adapter'
  gem 'jruby-httpclient'
end

gem 'pry'
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 LFDM
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,97 @@
1
+ # LLT::Tokenizer
2
+
3
+ Flexible service to tokenize Latin texts.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'llt-tokenizer'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install llt-tokenizer
18
+
19
+ ## Usage
20
+
21
+ The LLT's Tokenizer makes use of stem dictionaries. Refer to [these instructions](http://github.com/latin-language-toolkit/llt-db_handler "llt-db_handler") on how to set one up.
22
+
23
+ ```ruby
24
+ require 'llt/tokenizer'
25
+
26
+ t = LLT::Tokenizer.new
27
+ tokens = t.tokenize('Arma virumque cano.')
28
+ tokens.map(&:to_s)
29
+ # => ["Arma", "-que", "virum", "cano", "."]
30
+ ```
31
+
32
+ The Tokenizer takes several options upon creation or a call to #tokenize:
33
+
34
+ ```ruby
35
+ # shifting determines if enclitics shall be moved to
36
+ # their functional position
37
+ t = LLT::Tokenizer.new(shifting: true)
38
+ tokens = t.tokenize('In eoque arma cano.')
39
+ tokens.map(&:to_s)
40
+ # => ["-que", "In", "eo", "arma", "cano", "."]
41
+
42
+ # all options can be passed directly to #tokenize to override
43
+ # the default options
44
+ tokens = t.tokenize('In eoque arma cano.', shifting: false)
45
+ tokens.map(&:to_s)
46
+ # => ["In", "eo", "-que", "arma", "cano", "."]
47
+
48
+ # enclitics_marker takes a string, which marks up split enclitics
49
+ t = LLT::Tokenizer.new(enclitics_marker: '--', shifting: false)
50
+ tokens = t.tokenize('Arma virumque cano.')
51
+ tokens.map(&:to_s)
52
+ # => ["Arma", "virum", "--que", "cano", "."]
53
+
54
+ # indexing determines if each token shall receive a consecutive id
55
+ tokens = t.tokenize('Arma virumque cano.', indexing: true)
56
+ tokens.first.id # => 1
57
+ tokens = t.tokenize('Arma virumque cano.', indexing: false)
58
+ tokens.first.id # => nil
59
+
60
+ # merging enables token merging of lemmata, that often appear with
61
+ # orthographical inconsistencies
62
+ tokens = t.tokenize('Quam diu cano?', merging: true)
63
+ tokens.map(&:to_s)
64
+ # => ["Quamdiu", "cano", "?"]
65
+ ```
66
+
67
+ The returned items are instances of LLT::Token, which can be marked up
68
+ in a variety of forms:
69
+
70
+ ```ruby
71
+ t = LLT::Tokenizer.new(shifting: false, indexing: true)
72
+ tokens = t.tokenize('Arma virumque cano.')
73
+ tokens.map(&:to_xml)
74
+ # => ["<w>Arma</w>", "<w>virum</w>", "<w>-que</w>", "<w>cano</w>", "<pc>.</pc>"]
75
+ ```
76
+
77
+ Standard TEI XML markup is used: w tags for word tokens, pc tags for
78
+ punctuation. The #to_xml method is highly flexible as well, for full
79
+ coverage see _TODO_.
80
+
81
+ ```ruby
82
+ puts tokens.map { |token| token.to_xml(indexing: true) }
83
+ # <w n="1">Arma</w>
84
+ # <w n="2">virum</w>
85
+ # <w n="3">-que</w>
86
+ # <w n="4">cano</w>
87
+ # <pc n="5">.</pc>
88
+ ```
89
+
90
+
91
+ ## Contributing
92
+
93
+ 1. Fork it
94
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
95
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
96
+ 4. Push to the branch (`git push origin my-new-feature`)
97
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
# Rake entry point: `rake` (no arguments) runs the RSpec suite.
require "bundler/gem_tasks"
require "rspec/core/rake_task"

RSpec::Core::RakeTask.new(:spec)

task default: :spec
@@ -0,0 +1,31 @@
1
module LLT
  class Token
    # Placeholder token used to fill gaps in a token stream.
    # Serialized as a TEI <w> element.
    class Filler < Token
      xml_tag 'w'

      # Dispatches on the requested filler type; currently only :name
      # is recognized (and its handler is disabled below).
      def add(type)
        add_name_form if type == :name
      end

      #def add_name_form
      #  @possible_forms << PersonaFiller.new(@word)
      #end

      # Cannot hold anything atm and is therefore never really empty.
      def empty?
        false
      end
      alias_method :no_forms?, :empty?

      def set_functions
        [:filler]
      end

      def inspect
        format('%s: %s', 'Filler token'.blue, @string)
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
module LLT
  class Token
    # Token for punctuation marks, serialized as a TEI <pc> element.
    class Punctuation < Token
      xml_tag 'pc'

      attr_accessor :opening, :closing, :other

      # @param string [String] the punctuation mark itself
      # @param id [Integer, nil] optional consecutive token id
      def initialize(string, id = nil)
        super
        # Remnants of an old, mostly unused interface -
        # some parts remain, find and delete them.
        @opening = @closing = @other = false
      end

      # Cannot hold anything atm and is therefore never really empty.
      def empty?
        false
      end
      alias_method :no_forms?, :empty?

      def set_functions
        [:punctuation]
      end

      # The raw punctuation string.
      def punctuation
        @string
      end

      def inspect
        ['Punctuation token:'.yellow, @string].join(' ')
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
module LLT
  class Token
    # A regular word token, serialized as a TEI <w> element.
    # Carries the morphological forms found for it in its container.
    class Word < Token
      xml_tag 'w'

      # The raw word string.
      def word
        @string
      end

      # True when no morphological forms were found.
      def no_forms?
        @container.empty?
      end

      def set_functions
        [:word]
      end

      # Picks a form: by 1-based index when i is given, otherwise the
      # first form for which the given block returns true.
      def use(i = nil)
        if i
          return @container[i - 1]
        elsif block_given?
          @container.find { |f| yield(f) }
        end
      end

      def inspect
        "#{"Word token".green}: #{@string}\n" +
          "\tForms: #{forms_to_s}\n"
      end

      def forms_to_s
        # was each_with_index_and_object, which is currently not available
        @container.each_with_index.each_with_object("") do |(f, i), str|
          # Enumerate 1-based so the printed numbers match the index
          # #use expects (was 0-based before - off by one against #use).
          str << enumeration(i + 1) << stripped_form(f)
          str << delimiter unless f == @container.last
          str
        end
      end

      # The form's string with the word itself stripped away.
      def stripped_form(form)
        form.to_s.sub(@string, "").strip
      end

      def enumeration(i)
        "#{i}: ".light_yellow
      end

      def delimiter
        " | ".cyan
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ module LLT
2
+ class Token
3
+ class XmlTag < Token
4
+ def set_functions
5
+ [:xml_tag]
6
+ end
7
+
8
+ # overrides #to_xml from Containable - the tag stays at is it
9
+ def to_xml(*args)
10
+ to_s
11
+ end
12
+
13
+ def inspect
14
+ "#{'XML tag'.blue} #{tag_status}: #{to_s}"
15
+ end
16
+
17
+ private
18
+
19
+ def tag_status
20
+ to_s.match(/\//) ? 'open' : 'close'
21
+ end
22
+ end
23
+ end
24
+ end
data/lib/llt/token.rb ADDED
@@ -0,0 +1,51 @@
1
require 'llt/core/containable'
require 'llt/helpers/functions'

module LLT
  # Base class of all token types (Word, Punctuation, Filler, XmlTag).
  # Containable supplies the @string/@container plumbing; subclasses
  # define #set_functions to describe what roles they can play.
  class Token
    include Core::Containable
    include Helpers::Functions
    #include Phonology

    require 'llt/token/word'
    require 'llt/token/punctuation'
    require 'llt/token/filler'
    require 'llt/token/xml_tag'

    # :special_roles is deliberately NOT an attr_reader: the explicit
    # method below adds a nil-guard (the generated reader was shadowed
    # by it anyway and only caused a method-redefinition warning).
    attr_reader :functions

    container_alias :forms

    def initialize(string, id = nil)
      super
      @functions = set_functions
    end

    # Roles attached via #set_special_role; empty array when none
    # have been assigned yet.
    def special_roles
      @special_roles || []
    end

    def has_special_role?(role)
      special_roles.include?(role)
    end

    def set_special_role(*roles)
      @special_roles ||= []
      @special_roles += roles
    end

    # deprecated
    def add_form(form)
      @forms << form
    end

    # deprecated
    def add_forms(forms)
      @forms += forms
    end

    def use(*args)
      # hook method, overwritten by Word
    end
  end
end
@@ -0,0 +1,20 @@
1
require 'sinatra/base'
require 'sinatra/respond_with'
require 'llt/tokenizer'
require 'llt/core/api'

# HTTP front end: exposes LLT::Tokenizer through a single Sinatra
# GET endpoint and serializes the result as XML.
class Api < Sinatra::Base
  register Sinatra::RespondWith
  helpers LLT::Core::Api::Helpers

  # GET /tokenize
  # All query params are passed straight to LLT::Tokenizer.new, so any
  # tokenizer option (shifting, indexing, merging, ...) can be given as
  # a query parameter.
  get '/tokenize' do
    # NOTE(review): typecast_params! and extract_text come from
    # LLT::Core::Api::Helpers - presumably they coerce "true"/"42"-style
    # strings and pull the text out of the params; confirm in llt-core.
    typecast_params!(params)
    text = extract_text(params)
    tokenizer = LLT::Tokenizer.new(params)
    tokens = tokenizer.tokenize(text)

    respond_to do |f|
      f.xml { to_xml(tokens, params) }
    end
  end
end
@@ -0,0 +1,5 @@
1
module LLT
  class Tokenizer
    # Gem version, referenced by llt-tokenizer.gemspec.
    VERSION = "0.0.1"
  end
end
@@ -0,0 +1,106 @@
1
require 'forwardable'

module LLT
  class Tokenizer
    # Aligns a tokenized metrical text (tokens carrying quantity marks)
    # with its bare counterpart: enclitic splits/shifts, abbreviation
    # dots and merged words detected on the bare text are re-applied to
    # the metrical tokens in place.
    #
    # NOTE(review): relies on ENCLITICS and ArrayScanner from the
    # surrounding tokenizer code, and on Helpers::Metrical#wo_meter -
    # presumably "without meter", i.e. strips quantity marks; confirm.
    class Worker
      extend Forwardable
      include Enumerable
      include Helpers::Metrical

      # Enumerable/array behavior is delegated to the bare text.
      def_delegators :@bare_text, :each, :[], :[]=, :insert, :delete_at,
                     :each_overlapping_pair, :map!

      # TODO 28.11.13 11:45 by LFDM
      # Edge cases?
      # Merge words?

      # @param metric_text [Array<String>] tokens with metrical markup
      # @param marker [String] the enclitics marker, e.g. '-'
      def initialize(metric_text, marker)
        @metric_text = metric_text
        # bare counterpart: the same tokens with quantity marks stripped
        @bare_text = metric_text.map { |token| wo_meter(token) }
        @marker = marker
        # e.g. ['-que', '-ne', ...] - what shifted enclitics look like
        @marked_enclitics = ENCLITICS.map { |e| "#{@marker}#{e}"}
      end

      # Returns the metrical tokens, aligned in place (mutates the
      # array passed to #initialize).
      def to_a
        align_metrical_text
        @metric_text
      end

      private

      # One ugly method, but we don't want to slow it down even more
      def align_metrical_text
        m = ArrayScanner.new(@metric_text)
        b = ArrayScanner.new(@bare_text)
        loop do
          # metric element
          x = m.scan
          # bare element
          y = b.scan
          no_meter = wo_meter(x)

          # we don't have to do anything if the dequantified metric element
          # was the same as the bare element - the metric_text was right
          # at this position
          unless no_meter == y

            # If the bare element was a marked enclitic, it must have been
            # shifted. We're looking for the next metric token, that has it
            # attached and try to find the string index where it starts to
            # slice it of.
            # Usually the metric element just scanned (y) will have it, if we
            # don't find it, a double shift has occured and it should sit right
            # at the current element of the metric ArrayScanner (m).
            # The enclitic (sliced of x) has to be inserted one position before.
            if @marked_enclitics.include?(y)
              clean_encl_re = /#{y.dup.delete(@marker)}$/
              unless index = no_meter =~ clean_encl_re
                x = m.current
                index = wo_meter(x) =~ clean_encl_re
              end
              insert!(slice_encl!(x, index), m.pos - 1)

            # If the dequantified metric element has an enclitic attached, the
            # option shifting: false must have been given. The enclitic will
            # follow right after in the @bare_text, we can therefore slice and
            # insert right in place (the next # scan round will reveal that
            # enclitic in metric_text == enclitic in bare_text
            elsif encl = ENCLITICS.find { |e| no_meter.end_with?(e) }
              index = no_meter =~ /#{encl}$/
              insert!(slice_encl!(x, index), m.pos)

            # If the bare element has a dot attached, it must have been an
            # abbreviation.
            # The . will appear right afterwards in the metric text. We can
            # delete it and append it to the last scanned metric element (x)
            #
            # We need to do the same if merge words were present.
            # The last metric element was quam, the bare element is quamdiu.
            # We append if the last metric element + the next metric element
            # is the same as the bare element.
            elsif y.end_with?('.') || merged_words_present?(no_meter, y, m)
              append_from_deleted_index!(x, m.pos)
            end
          end
          break if b.eoa?
        end
      end

      # Inserts a marked-up enclitic into the metrical text at position.
      def insert!(enclitic, position)
        @metric_text.insert(position, "#{@marker}#{enclitic}")
      end

      # Destructively cuts token from index to its end, returning the slice.
      def slice_encl!(token, index)
        token.slice!(index..-1)
      end

      # Removes the metric element at index and appends it to token
      # (mutates both token and @metric_text).
      def append_from_deleted_index!(token, index)
        token << @metric_text.delete_at(index)
      end

      # True when the last metric token plus the upcoming one spell the
      # bare token, i.e. the bare text merged two words
      # (e.g. quam + diu == quamdiu).
      def merged_words_present?(last_metric, last_bare, metric_arr_scanner)
        (last_metric + wo_meter(metric_arr_scanner.peek)) == last_bare
      end
    end
  end
end