tokenizer 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +3 -1
- data/README.rdoc +8 -8
- data/bin/tokenize +3 -4
- data/lib/tokenizer/tokenizer.rb +42 -33
- data/lib/tokenizer/version.rb +1 -1
- data/test/regression_tests/test_de_tokenizer.rb +21 -3
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f536183270ebc769890d1210adfd255a4df593a1
|
4
|
+
data.tar.gz: 4396fc6566fe0703e326a6c135d644c071fd4f28
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e14294be0b8ac2a341bab0dea72196e76e82a8192432694dcca0a1456d86290f48ba332b56a19a0401b38360643734c99333ef2e70c7a2a9d1801b1c7618a9bf
|
7
|
+
data.tar.gz: d603d33571f3ae9b1a1721784ad5da0e531acd763811989ef34d91b453713b3558bea370a0ca0c13a5c8fef5a5d0ac0a9ab71a2e3be7ac6b47a328624df67d82
|
data/.yardopts
CHANGED
data/README.rdoc
CHANGED
@@ -44,16 +44,16 @@ You can use +Tokenizer+ in two ways.
|
|
44
44
|
$ echo 'Hi, ich gehe in die Schule!' | tokenize
|
45
45
|
|
46
46
|
* As a library for embedded tokenization:
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
47
|
+
> require 'tokenizer'
|
48
|
+
> de_tokenizer = Tokenizer::Tokenizer.new
|
49
|
+
> de_tokenizer.tokenize('Ich gehe in die Schule!')
|
50
|
+
> => ["Ich", "gehe", "in", "die", "Schule", "!"]
|
51
51
|
|
52
52
|
* Customizable PRE and POST list
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
53
|
+
> require 'tokenizer'
|
54
|
+
> de_tokenizer = Tokenizer::Tokenizer.new(:de, { post: Tokenizer::Tokenizer::POST + ['|'] })
|
55
|
+
> de_tokenizer.tokenize('Ich gehe|in die Schule!')
|
56
|
+
> => ["Ich", "gehe", "|in", "die", "Schule", "!"]
|
57
57
|
|
58
58
|
See documentation in the Tokenizer::Tokenizer class for details
|
59
59
|
on particular methods.
|
data/bin/tokenize
CHANGED
@@ -1,10 +1,9 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
|
4
3
|
require 'tokenizer'
|
5
4
|
|
6
|
-
|
5
|
+
tokenizer = Tokenizer::Tokenizer.new
|
7
6
|
|
8
|
-
while
|
9
|
-
|
7
|
+
while (line = gets)
|
8
|
+
puts tokenizer.tokenize(line).join("\n")
|
10
9
|
end
|
data/lib/tokenizer/tokenizer.rb
CHANGED
@@ -4,19 +4,34 @@
|
|
4
4
|
|
5
5
|
# A namespace for all project related stuff.
|
6
6
|
module Tokenizer
|
7
|
+
# Simple whitespace based tokenizer with configurable punctuation detection.
|
7
8
|
class Tokenizer
|
9
|
+
# Default whitespace separator.
|
8
10
|
FS = Regexp.new('[[:blank:]]+')
|
9
11
|
|
10
|
-
#
|
11
|
-
SIMPLE_PRE = []
|
12
|
-
|
12
|
+
# Characters only in the role of splittable prefixes.
|
13
|
+
SIMPLE_PRE = ['¿', '¡']
|
14
|
+
|
15
|
+
# Characters only in the role of splittable suffixes.
|
13
16
|
SIMPLE_POST = ['!', '?', ',', ':', ';', '.']
|
14
|
-
|
17
|
+
|
18
|
+
# Characters as splittable prefixes with an optional matching suffix.
|
19
|
+
PAIR_PRE = ['(', '{', '[', '<', '«', '„']
|
20
|
+
|
21
|
+
# Characters as splittable suffixes with an optional matching prefix.
|
22
|
+
PAIR_POST = [')', '}', ']', '>', '»', '“']
|
23
|
+
|
24
|
+
# Characters which can be both prefixes AND suffixes.
|
15
25
|
PRE_N_POST = ['"', "'"]
|
16
26
|
|
17
|
-
|
18
|
-
POST = SIMPLE_POST + PAIR_POST
|
27
|
+
private_constant :FS
|
19
28
|
|
29
|
+
# @param [Symbol] lang Language identifier.
|
30
|
+
# @param [Hash] options Additional options.
|
31
|
+
# @option options [Array] :pre Array of splittable prefix characters.
|
32
|
+
# @option options [Array] :post Array of splittable suffix characters.
|
33
|
+
# @option options [Array] :pre_n_post Array of characters with
|
34
|
+
# suffix AND prefix functions.
|
20
35
|
def initialize(lang = :de, options = {})
|
21
36
|
@lang = lang
|
22
37
|
@options = {
|
@@ -26,39 +41,33 @@ module Tokenizer
|
|
26
41
|
}.merge(options)
|
27
42
|
end
|
28
43
|
|
44
|
+
# @param [String] str String to be tokenized.
|
45
|
+
# @return [Array<String>] Array of tokens.
|
29
46
|
def tokenize(str)
|
30
|
-
|
47
|
+
tokens = sanitize_input(str).split(FS)
|
48
|
+
return [''] if tokens.empty?
|
31
49
|
|
32
|
-
|
50
|
+
splittables = SIMPLE_PRE + SIMPLE_POST + PAIR_PRE + PAIR_POST + PRE_N_POST
|
51
|
+
pattern = Regexp.new("[^#{Regexp.escape(splittables.join)}]+")
|
52
|
+
output = []
|
53
|
+
tokens.each do |token|
|
54
|
+
prefix, stem, suffix = token.partition(pattern)
|
55
|
+
output << prefix.split('') unless prefix.empty?
|
56
|
+
output << stem unless stem.empty?
|
57
|
+
output << suffix.split('') unless suffix.empty?
|
58
|
+
end
|
33
59
|
|
34
|
-
|
60
|
+
output.flatten
|
61
|
+
end
|
35
62
|
|
36
|
-
|
37
|
-
field.each_char.with_index do |ch, idx|
|
38
|
-
case
|
39
|
-
when @options[:pre].include?(ch)
|
40
|
-
output << "#{ch}\n"
|
41
|
-
when @options[:post].include?(ch)
|
42
|
-
output << "\n#{ch}"
|
43
|
-
if ['?', '!', '.'].include?(ch)
|
44
|
-
output << "\n"
|
45
|
-
end
|
46
|
-
when @options[:pre_n_post].include?(ch)
|
47
|
-
if idx == 0
|
48
|
-
output << "#{ch}\n"
|
49
|
-
elsif idx != 0
|
50
|
-
output << "\n#{ch}"
|
51
|
-
end
|
52
|
-
else
|
53
|
-
output << ch
|
54
|
-
end
|
55
|
-
end
|
63
|
+
alias process tokenize
|
56
64
|
|
57
|
-
|
58
|
-
end
|
65
|
+
private
|
59
66
|
|
60
|
-
|
61
|
-
|
67
|
+
# @param [String] User defined string to be tokenized.
|
68
|
+
# @return [String] A new modified string.
|
69
|
+
def sanitize_input(str)
|
70
|
+
str.chomp.strip
|
62
71
|
end
|
63
72
|
end # class
|
64
73
|
end # module
|
data/lib/tokenizer/version.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
|
+
# coding: utf-8
|
1
2
|
require 'minitest/autorun'
|
3
|
+
require 'minitest/spec'
|
2
4
|
require 'tokenizer'
|
3
5
|
|
4
6
|
class TestTokenizer < Minitest::Test
|
5
7
|
|
6
8
|
def setup
|
7
|
-
@
|
9
|
+
@t = Tokenizer::Tokenizer.new(:de)
|
8
10
|
end
|
9
11
|
|
10
12
|
def test_constants
|
@@ -12,14 +14,30 @@ class TestTokenizer < Minitest::Test
|
|
12
14
|
end
|
13
15
|
|
14
16
|
def test_output_type
|
15
|
-
output = @
|
17
|
+
output = @t.tokenize('ich gehe in die Schule')
|
16
18
|
assert(output.is_a?(Array))
|
17
19
|
end
|
18
20
|
|
19
21
|
def test_tokenization_001
|
20
22
|
input = 'Ich ging in die Schule!'
|
21
23
|
etalon = %w(Ich ging in die Schule !)
|
22
|
-
output = @
|
24
|
+
output = @t.tokenize(input)
|
23
25
|
assert_equal(etalon, output)
|
24
26
|
end
|
27
|
+
|
28
|
+
def test_tokenization_002
|
29
|
+
input = '" Es ist wirklich schwer zu sagen , welche Positionen er einnimmt , da er sich noch nicht konkret geäußert hat " , beklagen Volkswirte .'
|
30
|
+
etalon = %w(" Es ist wirklich schwer zu sagen , welche Positionen er einnimmt , da er sich noch nicht konkret geäußert hat " , beklagen Volkswirte .)
|
31
|
+
output = @t.tokenize(input)
|
32
|
+
assert_equal(etalon, output)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
describe Tokenizer do
|
37
|
+
describe 'empty input' do
|
38
|
+
it 'should return an Array with an empty string' do
|
39
|
+
tokens = Tokenizer::Tokenizer.new.tokenize('')
|
40
|
+
tokens.must_equal([''])
|
41
|
+
end
|
42
|
+
end
|
25
43
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrei Beliankou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-01-11 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: A simple multilingual tokenizer for NLP tasks. This tool provides a CLI
|
14
14
|
and a library for linguistic tokenization which is an unavoidable step for many
|
@@ -41,7 +41,8 @@ files:
|
|
41
41
|
- test/development_tests/test_ru_tokenizer_dev.rb
|
42
42
|
- test/regression_tests/test_de_tokenizer.rb
|
43
43
|
homepage: https://github.com/arbox/tokenizer
|
44
|
-
licenses:
|
44
|
+
licenses:
|
45
|
+
- MIT
|
45
46
|
metadata: {}
|
46
47
|
post_install_message:
|
47
48
|
rdoc_options: []
|