llt-tokenizer 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/.travis.yml +7 -0
- data/Gemfile +27 -0
- data/LICENSE.txt +22 -0
- data/README.md +97 -0
- data/Rakefile +6 -0
- data/lib/llt/token/filler.rb +31 -0
- data/lib/llt/token/punctuation.rb +36 -0
- data/lib/llt/token/word.rb +53 -0
- data/lib/llt/token/xml_tag.rb +24 -0
- data/lib/llt/token.rb +51 -0
- data/lib/llt/tokenizer/api.rb +20 -0
- data/lib/llt/tokenizer/version.rb +5 -0
- data/lib/llt/tokenizer/worker.rb +106 -0
- data/lib/llt/tokenizer.rb +362 -0
- data/llt-tokenizer.gemspec +30 -0
- data/spec/lib/llt/tokenizer/api_spec.rb +58 -0
- data/spec/lib/llt/tokenizer_spec.rb +361 -0
- data/spec/spec_helper.rb +28 -0
- data/spec/support/matchers/tokenizer.rb +5 -0
- metadata +195 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: c59bde34626f221dcf1880325b147dc2673c055f
|
4
|
+
data.tar.gz: f78d95a200b8dac652e0cc9a77363c57514c230d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e1240191c6edec8d7a942504dccad5eb1aa539644ea3c852f050f6b9cea2b91fb76c6c2a310fa076c17aafdd9a8d6ed5ed14a8e355b8bb8be88faf8ab564f0b8
|
7
|
+
data.tar.gz: 65469d5164e9960c21608bafa694e907ad5ae25d3c281f2fad22b94ebbd6db80cb3789d2a66e4748d7c71021c8dd79194f456a848690bed5363afda60af4f03f
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
source 'https://rubygems.org'

# Specify your gem's dependencies in llt-tokenizer.gemspec
gemspec

gem 'coveralls', require: false

gem 'llt-core', git: 'git@github.com:latin-language-toolkit/llt-core.git'
gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
gem 'llt-constants', git: 'git@github.com:latin-language-toolkit/llt-constants.git'
gem 'llt-db_handler', git: 'git@github.com:latin-language-toolkit/llt-db_handler.git'
gem 'llt-helpers', git: 'git@github.com:latin-language-toolkit/llt-helpers.git'

# Dependencies of db_handler
# NOTE: llt-core_extensions is already declared above with identical
# options; the former duplicate entry was removed because Bundler warns
# (and can error) when a gem is listed more than once in a Gemfile.
gem 'llt-form_builder', git: 'git@github.com:latin-language-toolkit/llt-form_builder.git'

platform :ruby do
  gem 'pg'
end

platform :jruby do
  gem 'activerecord-jdbcpostgresql-adapter'
  gem 'jruby-httpclient'
end

gem 'pry'
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 LFDM
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
# LLT::Tokenizer
|
2
|
+
|
3
|
+
Flexible service to tokenize Latin texts.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'llt-tokenizer'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install llt-tokenizer
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
The LLT's Tokenizer makes use of stem dictionaries. Refer to [these instructions](http://github.com/latin-language-toolkit/llt-db_handler "llt-db_handler") on how to set one up.
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
require 'llt/tokenizer'
|
25
|
+
|
26
|
+
t = LLT::Tokenizer.new
|
27
|
+
tokens = t.tokenize('Arma virumque cano.')
|
28
|
+
tokens.map(&:to_s)
|
29
|
+
# => ["Arma", "-que", "virum", "cano", "."]
|
30
|
+
```
|
31
|
+
|
32
|
+
The Tokenizer takes several options upon creation or a call to #tokenize:
|
33
|
+
|
34
|
+
```ruby
|
35
|
+
# shifting determines if enclitics shall be moved to
|
36
|
+
# their functional position
|
37
|
+
t = LLT::Tokenizer.new(shifting: true)
|
38
|
+
tokens = t.tokenize('In eoque arma cano.')
|
39
|
+
tokens.map(&:to_s)
|
40
|
+
# => ["-que", "In", "eo", "arma", "cano", "."]
|
41
|
+
|
42
|
+
# all options can be passed directly to #tokenize to override
|
43
|
+
# the default options
|
44
|
+
tokens = t.tokenize('In eoque arma cano.', shifting: false)
|
45
|
+
tokens.map(&:to_s)
|
46
|
+
# => ["In", "eo", "-que", "arma", "cano", "."]
|
47
|
+
|
48
|
+
# enclitics_marker takes a string, which marks up split enclitics
|
49
|
+
t = LLT::Tokenizer.new(enclitics_marker: '--', shifting: false)
|
50
|
+
tokens = t.tokenize('Arma virumque cano.')
|
51
|
+
tokens.map(&:to_s)
|
52
|
+
# => ["Arma", "virum", "--que", "cano", "."]
|
53
|
+
|
54
|
+
# indexing determines if each token shall receive a consecutive id
|
55
|
+
tokens = t.tokenize('Arma virumque cano.', indexing: true)
|
56
|
+
tokens.first.id # => 1
|
57
|
+
tokens = t.tokenize('Arma virumque cano.', indexing: false)
|
58
|
+
tokens.first.id # => nil
|
59
|
+
|
60
|
+
# merging enables token merging of lemmata that often appear with
|
61
|
+
# orthographical inconsistencies
|
62
|
+
tokens = t.tokenize('Quam diu cano?', merging: true)
|
63
|
+
tokens.map(&:to_s)
|
64
|
+
# => ["Quamdiu", "cano", "?"]
|
65
|
+
```
|
66
|
+
|
67
|
+
The returned items are instances of LLT::Token, which can be marked up
|
68
|
+
in a variety of forms:
|
69
|
+
|
70
|
+
```ruby
|
71
|
+
t = LLT::Tokenizer.new(shifting: false, indexing: true)
|
72
|
+
tokens = t.tokenize('Arma virumque cano.')
|
73
|
+
tokens.map(&:to_xml)
|
74
|
+
# => ["<w>Arma</w>", "<w>virum</w>", "<w>-que</w>", "<w>cano</w>", "<pc>.</pc>"]
|
75
|
+
```
|
76
|
+
|
77
|
+
Standard TEI XML markup is used: w tags for word tokens, pc tags for
|
78
|
+
punctuation. The #to_xml method is highly flexible as well, for full
|
79
|
+
coverage see _TODO_.
|
80
|
+
|
81
|
+
```ruby
|
82
|
+
puts tokens.map { |token| token.to_xml(indexing: true) }
|
83
|
+
# <w n="1">Arma</w>
|
84
|
+
# <w n="2">virum</w>
|
85
|
+
# <w n="3">-que</w>
|
86
|
+
# <w n="4">cano</w>
|
87
|
+
# <pc n="5">.</pc>
|
88
|
+
```
|
89
|
+
|
90
|
+
|
91
|
+
## Contributing
|
92
|
+
|
93
|
+
1. Fork it
|
94
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
95
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
96
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
97
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
module LLT
  class Token
    # Placeholder token: stands in for a word that carries no real
    # morphological analysis yet.
    class Filler < Token
      xml_tag 'w'

      # Dispatches to a specialized form-adding routine for the given
      # type. Only :name is recognized; any other type is a no-op.
      def add(type)
        add_name_form if type == :name
      end

      #def add_name_form
      #  @possible_forms << PersonaFiller.new(@word)
      #end

      # cannot hold anything atm, is therefore never really empty
      def empty?
        false
      end
      alias :no_forms? :empty?

      # Functions this token class reports to the Helpers::Functions mixin.
      def set_functions
        [:filler]
      end

      # Colorized debug representation (#blue comes from a colorization
      # extension available in the project).
      def inspect
        "#{'Filler token'.blue}: #{@string}"
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module LLT
  class Token
    # Token subclass representing a punctuation mark.
    class Punctuation < Token
      xml_tag 'pc'

      attr_accessor :opening, :closing, :other

      # @param string [String] the punctuation character(s)
      # @param id [Integer, nil] optional consecutive token id
      def initialize(string, id = nil)
        super
        # this is part of an old interface that is mostly unused
        # some parts remain - find and delete em
        @opening = @closing = @other = false
      end

      # cannot hold anything atm, is therefore never really empty
      def empty?
        false
      end
      alias :no_forms? :empty?

      # Functions this token class reports to the Helpers::Functions mixin.
      def set_functions
        [:punctuation]
      end

      # The punctuation string itself.
      def punctuation
        @string
      end

      # Colorized debug representation.
      def inspect
        "#{'Punctuation token:'.yellow} #{@string}"
      end
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module LLT
  class Token
    # Token subclass for regular word forms; holds the morphological
    # forms attached to the word in its container.
    class Word < Token
      xml_tag 'w'

      # The raw word string.
      def word
        @string
      end

      # True when no forms have been attached yet.
      def no_forms?
        @container.empty?
      end

      # Functions this token class reports to the Helpers::Functions mixin.
      def set_functions
        [:word]
      end

      # Selects a form: by 1-based index when i is given, otherwise the
      # first form for which the given block is truthy. Returns nil when
      # neither an index nor a block is supplied.
      def use(i = nil)
        return @container[i - 1] if i
        @container.find { |form| yield(form) } if block_given?
      end

      # Colorized debug representation including an enumerated form listing.
      def inspect
        "#{'Word token'.green}: #{@string}\n" + "\tForms: #{forms_to_s}\n"
      end

      # Builds the enumerated, delimiter-separated listing of all forms.
      def forms_to_s
        last = @container.last
        @container.each_with_index.each_with_object("") do |(form, idx), out|
          out << enumeration(idx) << stripped_form(form)
          out << delimiter unless form == last
        end
      end

      # The form rendered as a string with the word itself removed.
      def stripped_form(form)
        form.to_s.sub(@string, "").strip
      end

      # Colorized zero-based enumeration prefix.
      def enumeration(i)
        "#{i}: ".light_yellow
      end

      # Colorized separator between listed forms.
      def delimiter
        " | ".cyan
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module LLT
  class Token
    # Token subclass that wraps a raw XML tag found in the input text.
    # The tag is passed through untouched.
    class XmlTag < Token
      # Functions this token class reports to the Helpers::Functions mixin.
      def set_functions
        [:xml_tag]
      end

      # overrides #to_xml from Containable - the tag stays as it is
      def to_xml(*args)
        to_s
      end

      # Colorized debug representation including open/close status.
      def inspect
        "#{'XML tag'.blue} #{tag_status}: #{to_s}"
      end

      private

      # A tag that contains a slash (e.g. </foo>) is a CLOSING tag.
      # BUGFIX: the original returned 'open' when a slash was present
      # and 'close' otherwise - exactly inverted.
      def tag_status
        to_s.match(/\//) ? 'close' : 'open'
      end
    end
  end
end
|
data/lib/llt/token.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'llt/core/containable'
require 'llt/helpers/functions'

module LLT
  # Base class for all token types produced by the tokenizer.
  # Concrete subclasses (Word, Punctuation, Filler, XmlTag) declare
  # their function set via #set_functions.
  class Token
    include Core::Containable
    include Helpers::Functions
    #include Phonology

    require 'llt/token/word'
    require 'llt/token/punctuation'
    require 'llt/token/filler'
    require 'llt/token/xml_tag'

    # NOTE: :special_roles used to be listed here as well, but the
    # explicit #special_roles method below shadows a generated reader,
    # so declaring it was redundant and has been removed.
    attr_reader :functions

    container_alias :forms

    # @param string [String] the surface form of the token
    # @param id [Integer, nil] optional consecutive token id
    def initialize(string, id = nil)
      super
      @functions = set_functions
    end

    # Roles assigned via #set_special_role; an empty array when none
    # have been set (avoids exposing a nil ivar).
    def special_roles
      @special_roles || []
    end

    def has_special_role?(role)
      special_roles.include?(role)
    end

    # Adds one or more special roles, creating the backing array lazily.
    def set_special_role(*roles)
      @special_roles ||= []
      @special_roles += roles
    end

    # deprecated
    def add_form(form)
      @forms << form
    end

    # deprecated
    def add_forms(forms)
      @forms += forms
    end

    def use(*args)
      # hook method, overwritten by Word
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'sinatra/base'
require 'sinatra/respond_with'
require 'llt/tokenizer'
require 'llt/core/api'

# Thin HTTP wrapper around LLT::Tokenizer.
class Api < Sinatra::Base
  register Sinatra::RespondWith
  helpers LLT::Core::Api::Helpers

  # GET /tokenize
  # typecast_params! and extract_text presumably come from
  # LLT::Core::Api::Helpers (included above) - the full params hash,
  # minus nothing, is then handed to the Tokenizer as its options.
  # Responds with an XML rendering of the resulting tokens.
  get '/tokenize' do
    typecast_params!(params)
    text = extract_text(params)
    tokenizer = LLT::Tokenizer.new(params)
    tokens = tokenizer.tokenize(text)

    respond_to do |f|
      f.xml { to_xml(tokens, params) }
    end
  end
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
require 'forwardable'

module LLT
  class Tokenizer
    # Re-aligns a metrically annotated token stream (vowel quantities
    # marked) with the bare tokenization the Tokenizer produced, so that
    # enclitic shifts, abbreviation dots and merged words line up again.
    class Worker
      extend Forwardable
      include Enumerable
      include Helpers::Metrical

      def_delegators :@bare_text, :each, :[], :[]=, :insert, :delete_at,
                     :each_overlapping_pair, :map!

      # TODO 28.11.13 11:45 by LFDM
      # Edge cases?
      # Merge words?

      # @param metric_text [Array<String>] tokens carrying metrical markup
      # @param marker [String] the enclitics marker (e.g. '-')
      def initialize(metric_text, marker)
        @metric_text = metric_text
        # bare counterpart of every token - wo_meter (from
        # Helpers::Metrical) strips the metrical signs
        @bare_text = metric_text.map { |token| wo_meter(token) }
        @marker = marker
        # enclitics with the marker prepended, e.g. '-que'
        @marked_enclitics = ENCLITICS.map { |e| "#{@marker}#{e}"}
      end

      # Aligns the metric text in place and returns it.
      def to_a
        align_metrical_text
        @metric_text
      end

      private

      # One ugly method, but we don't want to slow it down even more
      def align_metrical_text
        m = ArrayScanner.new(@metric_text)
        b = ArrayScanner.new(@bare_text)
        loop do
          # metric element
          x = m.scan
          # bare element
          y = b.scan
          no_meter = wo_meter(x)

          # we don't have to do anything if the dequantified metric element
          # was the same as the bare element - the metric_text was right
          # at this position
          unless no_meter == y

            # If the bare element was a marked enclitic, it must have been
            # shifted. We're looking for the next metric token, that has it
            # attached and try to find the string index where it starts to
            # slice it off.
            # Usually the metric element just scanned (y) will have it, if we
            # don't find it, a double shift has occurred and it should sit right
            # at the current element of the metric ArrayScanner (m).
            # The enclitic (sliced off x) has to be inserted one position before.
            if @marked_enclitics.include?(y)
              clean_encl_re = /#{y.dup.delete(@marker)}$/
              unless index = no_meter =~ clean_encl_re
                x = m.current
                index = wo_meter(x) =~ clean_encl_re
              end
              insert!(slice_encl!(x, index), m.pos - 1)

            # If the dequantified metric element has an enclitic attached, the
            # option shifting: false must have been given. The enclitic will
            # follow right after in the @bare_text, we can therefore slice and
            # insert right in place (the next scan round will reveal that
            # enclitic in metric_text == enclitic in bare_text)
            elsif encl = ENCLITICS.find { |e| no_meter.end_with?(e) }
              index = no_meter =~ /#{encl}$/
              insert!(slice_encl!(x, index), m.pos)

            # If the bare element has a dot attached, it must have been an
            # abbreviation.
            # The . will appear right afterwards in the metric text. We can
            # delete it and append it to the last scanned metric element (x)
            #
            # We need to do the same if merge words were present.
            # The last metric element was quam, the bare element is quamdiu.
            # We append if the last metric element + the next metric element
            # is the same as the bare element.
            elsif y.end_with?('.') || merged_words_present?(no_meter, y, m)
              append_from_deleted_index!(x, m.pos)
            end
          end
          break if b.eoa?
        end
      end

      # Inserts a sliced-off enclitic into the metric text, re-attaching
      # the marker.
      def insert!(enclitic, position)
        @metric_text.insert(position, "#{@marker}#{enclitic}")
      end

      # Destructively cuts the token at index and returns the cut tail
      # (the enclitic).
      def slice_encl!(token, index)
        token.slice!(index..-1)
      end

      # Removes the metric element at index and appends it to token -
      # used for abbreviation dots and merged words.
      def append_from_deleted_index!(token, index)
        token << @metric_text.delete_at(index)
      end

      # True when the bare element equals the last metric element plus
      # the upcoming one, i.e. two metric tokens were merged into one
      # bare token.
      def merged_words_present?(last_metric, last_bare, metric_arr_scanner)
        (last_metric + wo_meter(metric_arr_scanner.peek)) == last_bare
      end
    end
  end
end
|