llt-tokenizer 0.0.1
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/.travis.yml +7 -0
- data/Gemfile +27 -0
- data/LICENSE.txt +22 -0
- data/README.md +97 -0
- data/Rakefile +6 -0
- data/lib/llt/token/filler.rb +31 -0
- data/lib/llt/token/punctuation.rb +36 -0
- data/lib/llt/token/word.rb +53 -0
- data/lib/llt/token/xml_tag.rb +24 -0
- data/lib/llt/token.rb +51 -0
- data/lib/llt/tokenizer/api.rb +20 -0
- data/lib/llt/tokenizer/version.rb +5 -0
- data/lib/llt/tokenizer/worker.rb +106 -0
- data/lib/llt/tokenizer.rb +362 -0
- data/llt-tokenizer.gemspec +30 -0
- data/spec/lib/llt/tokenizer/api_spec.rb +58 -0
- data/spec/lib/llt/tokenizer_spec.rb +361 -0
- data/spec/spec_helper.rb +28 -0
- data/spec/support/matchers/tokenizer.rb +5 -0
- metadata +195 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: c59bde34626f221dcf1880325b147dc2673c055f
+  data.tar.gz: f78d95a200b8dac652e0cc9a77363c57514c230d
+SHA512:
+  metadata.gz: e1240191c6edec8d7a942504dccad5eb1aa539644ea3c852f050f6b9cea2b91fb76c6c2a310fa076c17aafdd9a8d6ed5ed14a8e355b8bb8be88faf8ab564f0b8
+  data.tar.gz: 65469d5164e9960c21608bafa694e907ad5ae25d3c281f2fad22b94ebbd6db80cb3789d2a66e4748d7c71021c8dd79194f456a848690bed5363afda60af4f03f
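
The two digest families above are the standard RubyGems package checksums for the metadata.gz and data.tar.gz entries inside the .gem archive. A minimal verification sketch follows; the local file name llt-tokenizer-0.0.1.gem is an assumption, and only SHA512 is shown (SHA1 is analogous):

```ruby
require 'digest'
require 'rubygems/package'

# Compute SHA512 digests of the archive entries named in checksums.yaml
# and compare the output against the values listed above.
File.open('llt-tokenizer-0.0.1.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io) do |tar|
    tar.each do |entry|
      next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
      puts "#{entry.full_name}: #{Digest::SHA512.hexdigest(entry.read)}"
    end
  end
end
```
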
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,27 @@
+source 'https://rubygems.org'
+
+# Specify your gem's dependencies in llt-tokenizer.gemspec
+gemspec
+
+gem 'coveralls', require: false
+
+gem 'llt-core', git: 'git@github.com:latin-language-toolkit/llt-core.git'
+gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
+gem 'llt-constants', git: 'git@github.com:latin-language-toolkit/llt-constants.git'
+gem 'llt-db_handler', git: 'git@github.com:latin-language-toolkit/llt-db_handler.git'
+gem 'llt-helpers', git: 'git@github.com:latin-language-toolkit/llt-helpers.git'
+
+# Dependencies of db_handler
+gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
+gem 'llt-form_builder', git: 'git@github.com:latin-language-toolkit/llt-form_builder.git'
+
+platform :ruby do
+  gem 'pg'
+end
+
+platform :jruby do
+  gem 'activerecord-jdbcpostgresql-adapter'
+  gem 'jruby-httpclient'
+end
+
+gem 'pry'
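
Note that every llt-* dependency above is pulled straight from GitHub over SSH (git@github.com: URLs), so bundling this Gemfile requires an SSH key with GitHub access. A hypothetical variant of two entries using HTTPS instead, for environments without SSH configured (the HTTPS URLs are assumed to mirror the SSH ones and are not part of the gem):

```ruby
# Hypothetical HTTPS equivalents of the SSH git sources used above.
gem 'llt-core',       git: 'https://github.com/latin-language-toolkit/llt-core.git'
gem 'llt-db_handler', git: 'https://github.com/latin-language-toolkit/llt-db_handler.git'
```
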
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2013 LFDM
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,97 @@
+# LLT::Tokenizer
+
+Flexible service to tokenize Latin texts.
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+    gem 'llt-tokenizer'
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install llt-tokenizer
+
+## Usage
+
+The LLT's Tokenizer makes use of stem dictionaries. Refer to [these instructions](http://github.com/latin-language-toolkit/llt-db_handler "llt-db_handler") on how to set one up.
+
+```ruby
+require 'llt/tokenizer'
+
+t = LLT::Tokenizer.new
+tokens = t.tokenize('Arma virumque cano.')
+tokens.map(&:to_s)
+# => ["Arma", "-que", "virum", "cano", "."]
+```
+
+The Tokenizer takes several options upon creation or a call to #tokenize:
+
+```ruby
+# shifting determines if enclitics shall be moved to
+# their functional position
+t = LLT::Tokenizer.new(shifting: true)
+tokens = t.tokenize('In eoque arma cano.')
+tokens.map(&:to_s)
+# => ["-que", "In", "eo", "arma", "cano", "."]
+
+# all options can be passed directly to #tokenize to override
+# the default options
+tokens = t.tokenize('In eoque arma cano.', shifting: false)
+tokens.map(&:to_s)
+# => ["In", "eo", "-que", "arma", "cano", "."]
+
+# enclitics_marker takes a string, which marks up splitted enclitics
+t = LLT::Tokenizer.new(enclitics_marker: '--', shifting: false)
+tokens = t.tokenize('Arma virumque cano.')
+tokens.map(&:to_s)
+# => ["Arma", "virum", "--que", "cano", "."]
+
+# indexing determines if each token shall receive a consecutive id
+tokens = t.tokenize('Arma virumque cano.', indexing: true)
+tokens.first.id # => 1
+tokens = t.tokenize('Arma virumque cano.', indexing: false)
+tokens.first.id # => nil
+
+# merging enables token merging of lemmata, that often appear with
+# orthographical inconsistencies
+tokens = t.tokenize('Quam diu cano?', merging: true)
+tokens.map(&:to_s)
+# => ["Quamdiu", "cano", "?"]
+```
+
+The returned items are instances of LLT::Token, which can be marked up
+in a variety of forms:
+
+```ruby
+t = LLT::Tokenizer.new(shifting: false, indexing: true)
+tokens = t.tokenize('Arma virumque cano.')
+tokens.map(&:to_xml)
+# => ["<w>arma</w>", "<w>virum</w>", "<w>-que</w>", "<w>cano</w>", "<pc>.</pc>"]
+```
+
+Standard TEI XML markup is used: w tags for word tokens, pc tags for
+punctuation. The #to_xml method is highly flexible as well, for full
+coverage see _TODO_.
+
+```ruby
+puts tokens.map { |token| token.to_xml(indexing: true) }
+# <w n="1">Arma</w>
+# <w n="2">virum</w>
+# <w n="3">-que</w>
+# <w n="4">cano</w>
+# <pc n="5">.</pc>
+```
+
+
+## Contributing
+
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request
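
The README's point that the returned items are LLT::Token instances connects to the token classes added further down (data/lib/llt/token/*.rb): words and punctuation are separate subclasses, each reporting its role through #functions. A purely illustrative sketch; the concrete class of each token is inferred from those files, not stated in the README:

```ruby
tokens = LLT::Tokenizer.new.tokenize('Arma virumque cano.')

# Word and punctuation tokens are presumably distinct Token subclasses
# (see the token classes below); #functions comes from their set_functions.
tokens.last.class      # => LLT::Token::Punctuation (inferred)
tokens.last.functions  # => [:punctuation]
tokens.first.functions # => [:word]
```
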
data/Rakefile
ADDED
data/lib/llt/token/filler.rb
ADDED
@@ -0,0 +1,31 @@
+module LLT
+  class Token
+    class Filler < Token
+      xml_tag 'w'
+
+      def add(type)
+        case type
+        when :name then add_name_form
+        end
+      end
+
+      #def add_name_form
+      #  @possible_forms << PersonaFiller.new(@word)
+      #end
+
+      # cannot hold anything atm, is therefore never really empty
+      def empty?
+        false
+      end
+      alias :no_forms? :empty?
+
+      def set_functions
+        [:filler]
+      end
+
+      def inspect
+        "#{"Filler token".blue}: #{@string}"
+      end
+    end
+  end
+end
data/lib/llt/token/punctuation.rb
ADDED
@@ -0,0 +1,36 @@
+module LLT
+  class Token
+    class Punctuation < Token
+      xml_tag 'pc'
+
+      attr_accessor :opening, :closing, :other
+
+      def initialize(string, id = nil)
+        super
+        # this is part of an old interface that is mostly unused
+        # some parts remain - find and delete em
+        @opening = false
+        @closing = false
+        @other = false
+      end
+
+      # cannot hold anything atm, is therefore never really empty
+      def empty?
+        false
+      end
+      alias :no_forms? :empty?
+
+      def set_functions
+        [:punctuation]
+      end
+
+      def punctuation
+        @string
+      end
+
+      def inspect
+        "#{"Punctuation token:".yellow} #{@string}"
+      end
+    end
+  end
+end
data/lib/llt/token/word.rb
ADDED
@@ -0,0 +1,53 @@
+module LLT
+  class Token
+    class Word < Token
+      xml_tag 'w'
+
+      def word
+        @string
+      end
+
+      def no_forms?
+        @container.empty?
+      end
+
+      def set_functions
+        [:word]
+      end
+
+      def use(i = nil)
+        if i
+          return @container[i - 1]
+        elsif block_given?
+          @container.find { |f| yield(f) }
+        end
+      end
+
+      def inspect
+        "#{"Word token".green}: #{@string}\n" +
+          "\tForms: #{forms_to_s}\n"
+      end
+
+      def forms_to_s
+        # was each_with_index_and_object, which is currently not available
+        @container.each_with_index.each_with_object("") do |(f, i), str|
+          str << enumeration(i) << stripped_form(f)
+          str << delimiter unless f == @container.last
+          str
+        end
+      end
+
+      def stripped_form(form)
+        form.to_s.sub(@string, "").strip
+      end
+
+      def enumeration(i)
+        "#{i}: ".light_yellow
+      end
+
+      def delimiter
+        " | ".cyan
+      end
+    end
+  end
+end
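
Word#use above returns a stored form either by 1-based index or by the first form for which a block is true. A minimal sketch, assuming `word` is an LLT::Token::Word whose forms container has already been filled by other parts of the toolkit (that population step is not shown in this gem):

```ruby
# Hypothetical usage of LLT::Token::Word#use; `word` and its forms are assumed
# to have been produced elsewhere (e.g. by a morphological analyzer).
word.no_forms?                                    # => false once forms are present
first = word.use(1)                               # 1-based index into the forms
match = word.use { |f| f.to_s.end_with?('ae') }   # first form matching the block
```
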
data/lib/llt/token/xml_tag.rb
ADDED
@@ -0,0 +1,24 @@
+module LLT
+  class Token
+    class XmlTag < Token
+      def set_functions
+        [:xml_tag]
+      end
+
+      # overrides #to_xml from Containable - the tag stays at is it
+      def to_xml(*args)
+        to_s
+      end
+
+      def inspect
+        "#{'XML tag'.blue} #{tag_status}: #{to_s}"
+      end
+
+      private
+
+      def tag_status
+        to_s.match(/\//) ? 'open' : 'close'
+      end
+    end
+  end
+end
data/lib/llt/token.rb
ADDED
@@ -0,0 +1,51 @@
+require 'llt/core/containable'
+require 'llt/helpers/functions'
+
+module LLT
+  class Token
+    include Core::Containable
+    include Helpers::Functions
+    #include Phonology
+
+    require 'llt/token/word'
+    require 'llt/token/punctuation'
+    require 'llt/token/filler'
+    require 'llt/token/xml_tag'
+
+    attr_reader :functions, :special_roles
+
+    container_alias :forms
+
+    def initialize(string, id = nil)
+      super
+      @functions = set_functions
+    end
+
+    def special_roles
+      @special_roles || []
+    end
+
+    def has_special_role?(role)
+      special_roles.include?(role)
+    end
+
+    def set_special_role(*roles)
+      @special_roles ||= []
+      @special_roles += roles
+    end
+
+    # deprecated
+    def add_form(form)
+      @forms << form
+    end
+
+    # deprecated
+    def add_forms(forms)
+      @forms += forms
+    end
+
+    def use(*args)
+      # hook method, overwritten by Word
+    end
+  end
+end
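
Besides its forms container, every token can carry special roles, set and queried through the methods above. A short sketch of that flag API; the constructor arguments assume that LLT::Core::Containable#initialize accepts a string and an id, which is not shown in this gem:

```ruby
# Sketch of the special-role API defined in LLT::Token above.
token = LLT::Token::Word.new('Caesar', 1)   # assumes Containable#initialize(string, id)
token.has_special_role?(:abbreviation)      # => false (defaults to an empty list)
token.set_special_role(:abbreviation, :name)
token.special_roles                         # => [:abbreviation, :name]
token.has_special_role?(:name)              # => true
```
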
data/lib/llt/tokenizer/api.rb
ADDED
@@ -0,0 +1,20 @@
+require 'sinatra/base'
+require 'sinatra/respond_with'
+require 'llt/tokenizer'
+require 'llt/core/api'
+
+class Api < Sinatra::Base
+  register Sinatra::RespondWith
+  helpers LLT::Core::Api::Helpers
+
+  get '/tokenize' do
+    typecast_params!(params)
+    text = extract_text(params)
+    tokenizer = LLT::Tokenizer.new(params)
+    tokens = tokenizer.tokenize(text)
+
+    respond_to do |f|
+      f.xml { to_xml(tokens, params) }
+    end
+  end
+end
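
The Sinatra app above exposes the tokenizer as a GET /tokenize endpoint that answers in XML. A minimal rackup sketch for running it; the config.ru file and the `text` request parameter (what LLT::Core::Api::Helpers#extract_text presumably reads) are assumptions, not part of the gem:

```ruby
# config.ru (assumed, not shipped with the gem)
require 'llt/tokenizer/api'

run Api

# An XML response could then be requested along these lines (parameter name assumed):
#   curl 'http://localhost:9292/tokenize.xml?text=Arma%20virumque%20cano.'
```
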
data/lib/llt/tokenizer/worker.rb
ADDED
@@ -0,0 +1,106 @@
+require 'forwardable'
+
+module LLT
+  class Tokenizer
+    class Worker
+      extend Forwardable
+      include Enumerable
+      include Helpers::Metrical
+
+      def_delegators :@bare_text, :each, :[], :[]=, :insert, :delete_at,
+                     :each_overlapping_pair, :map!
+
+      # TODO 28.11.13 11:45 by LFDM
+      # Edge cases?
+      # Merge words?
+
+      def initialize(metric_text, marker)
+        @metric_text = metric_text
+        @bare_text = metric_text.map { |token| wo_meter(token) }
+        @marker = marker
+        @marked_enclitics = ENCLITICS.map { |e| "#{@marker}#{e}"}
+      end
+
+      def to_a
+        align_metrical_text
+        @metric_text
+      end
+
+      private
+
+      # One ugly method, but we don't want to slow it down even more
+      def align_metrical_text
+        m = ArrayScanner.new(@metric_text)
+        b = ArrayScanner.new(@bare_text)
+        loop do
+          # metric element
+          x = m.scan
+          # bare element
+          y = b.scan
+          no_meter = wo_meter(x)
+
+          # we don't have to do anything if the dequantified metric element
+          # was the same as the bare element - the metric_text was right
+          # at this position
+          unless no_meter == y
+
+            # If the bare element was a marked enclitic, it must have been
+            # shifted. We're looking for the next metric token, that has it
+            # attached and try to find the string index where it starts to
+            # slice it of.
+            # Usually the metric element just scanned (y) will have it, if we
+            # don't find it, a double shift has occured and it should sit right
+            # at the current element of the metric ArrayScanner (m).
+            # The enclitic (sliced of x) has to be inserted one position before.
+            if @marked_enclitics.include?(y)
+              clean_encl_re = /#{y.dup.delete(@marker)}$/
+              unless index = no_meter =~ clean_encl_re
+                x = m.current
+                index = wo_meter(x) =~ clean_encl_re
+              end
+              insert!(slice_encl!(x, index), m.pos - 1)
+
+            # If the dequantified metric element has an enclitic attached, the
+            # option shifting: false must have been given. The enclitic will
+            # follow right after in the @bare_text, we can therefore slice and
+            # insert right in place (the next # scan round will reveal that
+            # enclitic in metric_text == enclitic in bare_text
+            elsif encl = ENCLITICS.find { |e| no_meter.end_with?(e) }
+              index = no_meter =~ /#{encl}$/
+              insert!(slice_encl!(x, index), m.pos)
+
+            # If the bare element has a dot attached, it must have been an
+            # abbreviation.
+            # The . will appear right afterwards in the metric text. We can
+            # delete it and append it to the last scanned metric element (x)
+            #
+            # We need to do the same if merge words were present.
+            # The last metric element was quam, the bare element is quamdiu.
+            # We append if the last metric element + the next metric element
+            # is the same as the bare element.
+            elsif y.end_with?('.') || merged_words_present?(no_meter, y, m)
+              append_from_deleted_index!(x, m.pos)
+            end
+          end
+          break if b.eoa?
+        end
+      end
+
+      def insert!(enclitic, position)
+        @metric_text.insert(position, "#{@marker}#{enclitic}")
+      end
+
+      def slice_encl!(token, index)
+        token.slice!(index..-1)
+      end
+
+      def append_from_deleted_index!(token, index)
+        token << @metric_text.delete_at(index)
+      end
+
+      def merged_words_present?(last_metric, last_bare, metric_arr_scanner)
+        (last_metric + wo_meter(metric_arr_scanner.peek)) == last_bare
+      end
+    end
+  end
+end