tokenizer 0.1.2 → 0.2.0
- checksums.yaml +4 -4
- data/.yardopts +3 -1
- data/README.rdoc +8 -8
- data/bin/tokenize +3 -4
- data/lib/tokenizer/tokenizer.rb +42 -33
- data/lib/tokenizer/version.rb +1 -1
- data/test/regression_tests/test_de_tokenizer.rb +21 -3
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f536183270ebc769890d1210adfd255a4df593a1
+  data.tar.gz: 4396fc6566fe0703e326a6c135d644c071fd4f28
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e14294be0b8ac2a341bab0dea72196e76e82a8192432694dcca0a1456d86290f48ba332b56a19a0401b38360643734c99333ef2e70c7a2a9d1801b1c7618a9bf
+  data.tar.gz: d603d33571f3ae9b1a1721784ad5da0e531acd763811989ef34d91b453713b3558bea370a0ca0c13a5c8fef5a5d0ac0a9ab71a2e3be7ac6b47a328624df67d82
data/.yardopts
CHANGED
data/README.rdoc
CHANGED
@@ -44,16 +44,16 @@ You can use +Tokenizer+ in two ways.
     $ echo 'Hi, ich gehe in die Schule!' | tokenize
 
 * As a library for embedded tokenization:
-
-
-
-
+    > require 'tokenizer'
+    > de_tokenizer = Tokenizer::Tokenizer.new
+    > de_tokenizer.tokenize('Ich gehe in die Schule!')
+    > => ["Ich", "gehe", "in", "die", "Schule", "!"]
 
 * Customizable PRE and POST list
-
-
-
-
+    > require 'tokenizer'
+    > de_tokenizer = Tokenizer::Tokenizer.new(:de, { post: Tokenizer::Tokenizer::POST + ['|'] })
+    > de_tokenizer.tokenize('Ich gehe|in die Schule!')
+    > => ["Ich", "gehe", "|in", "die", "Schule", "!"]
 
 See documentation in the Tokenizer::Tokenizer class for details
 on particular methods.
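Both README examples run through the reworked tokenize method shown under data/lib/tokenizer/tokenizer.rb below. As an extra illustration, here is the same irb-style format with the output traced by hand from the new partition-based algorithm and its PAIR_PRE/PAIR_POST defaults (a sketch, not an example from the gem's own docs):

    > require 'tokenizer'
    > de_tokenizer = Tokenizer::Tokenizer.new
    > de_tokenizer.tokenize('«Ich gehe», sagte er.')
    > => ["«", "Ich", "gehe", "»", ",", "sagte", "er", "."]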
data/bin/tokenize
CHANGED
@@ -1,10 +1,9 @@
 #!/usr/bin/env ruby
-# -*- coding: utf-8 -*-
 
 require 'tokenizer'
 
-
+tokenizer = Tokenizer::Tokenizer.new
 
-while
-
+while (line = gets)
+  puts tokenizer.tokenize(line).join("\n")
 end
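The rewritten script reads stdin line by line and prints each token on its own line (join("\n")). Traced directly from the code above, a session in the README's command-line style would look like:

    $ echo 'Ich gehe!' | tokenize
    Ich
    gehe
    !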
data/lib/tokenizer/tokenizer.rb
CHANGED
@@ -4,19 +4,34 @@
 
 # A namespace for all project related stuff.
 module Tokenizer
+  # Simple whitespace based tokenizer with configurable punctuation detection.
   class Tokenizer
+    # Default whitespace separator.
     FS = Regexp.new('[[:blank:]]+')
 
-    #
-    SIMPLE_PRE = []
-
+    # Characters only in the role of splittable prefixes.
+    SIMPLE_PRE = ['¿', '¡']
+
+    # Characters only in the role of splittable suffixes.
     SIMPLE_POST = ['!', '?', ',', ':', ';', '.']
-
+
+    # Characters as splittable prefixes with an optional matching suffix.
+    PAIR_PRE = ['(', '{', '[', '<', '«', '„']
+
+    # Characters as splittable suffixes with an optional matching prefix.
+    PAIR_POST = [')', '}', ']', '>', '»', '“']
+
+    # Characters which can be both prefixes AND suffixes.
     PRE_N_POST = ['"', "'"]
 
-
-    POST = SIMPLE_POST + PAIR_POST
+    private_constant :FS
 
+    # @param [Symbol] lang Language identifier.
+    # @param [Hash] options Additional options.
+    # @option options [Array] :pre Array of splittable prefix characters.
+    # @option options [Array] :post Array of splittable suffix characters.
+    # @option options [Array] :pre_n_post Array of characters with
+    #   suffix AND prefix functions.
     def initialize(lang = :de, options = {})
       @lang = lang
       @options = {
@@ -26,39 +41,33 @@ module Tokenizer
       }.merge(options)
     end
 
+    # @param [String] str String to be tokenized.
+    # @return [Array<String>] Array of tokens.
     def tokenize(str)
-
+      tokens = sanitize_input(str).split(FS)
+      return [''] if tokens.empty?
 
-
+      splittables = SIMPLE_PRE + SIMPLE_POST + PAIR_PRE + PAIR_POST + PRE_N_POST
+      pattern = Regexp.new("[^#{Regexp.escape(splittables.join)}]+")
+      output = []
+      tokens.each do |token|
+        prefix, stem, suffix = token.partition(pattern)
+        output << prefix.split('') unless prefix.empty?
+        output << stem unless stem.empty?
+        output << suffix.split('') unless suffix.empty?
+      end
 
-
+      output.flatten
+    end
 
-
-      field.each_char.with_index do |ch, idx|
-        case
-        when @options[:pre].include?(ch)
-          output << "#{ch}\n"
-        when @options[:post].include?(ch)
-          output << "\n#{ch}"
-          if ['?', '!', '.'].include?(ch)
-            output << "\n"
-          end
-        when @options[:pre_n_post].include?(ch)
-          if idx == 0
-            output << "#{ch}\n"
-          elsif idx != 0
-            output << "\n#{ch}"
-          end
-        else
-          output << ch
-        end
-      end
+    alias process tokenize
 
-
-    end
+    private
 
-
-
+    # @param [String] User defined string to be tokenized.
+    # @return [String] A new modified string.
+    def sanitize_input(str)
+      str.chomp.strip
     end
   end # class
 end # module
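The new tokenize builds a single regexp matching maximal runs of characters that are not in any splittable list, then uses String#partition to cut every whitespace-separated token into a prefix run, a stem, and a suffix run; the two runs are split character by character, and alias process tokenize exposes the same method under the name process. A minimal standalone sketch of the partition mechanic, with the constants inlined (an illustration, not the gem's public API):

    # All splittable characters from the constants above, inlined.
    splittables = ['¿', '¡', '!', '?', ',', ':', ';', '.',
                   '(', '{', '[', '<', '«', '„',
                   ')', '}', ']', '>', '»', '“', '"', "'"]
    # Matches the longest run of non-splittable characters.
    pattern = Regexp.new("[^#{Regexp.escape(splittables.join)}]+")

    # String#partition returns [head, first match, tail]:
    '(Schule)!'.partition(pattern)
    # => ["(", "Schule", ")!"]
    # The prefix and suffix runs are then split into single characters,
    # yielding the tokens ["(", "Schule", ")", "!"]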
data/lib/tokenizer/version.rb
CHANGED
data/test/regression_tests/test_de_tokenizer.rb
CHANGED
@@ -1,10 +1,12 @@
+# coding: utf-8
 require 'minitest/autorun'
+require 'minitest/spec'
 require 'tokenizer'
 
 class TestTokenizer < Minitest::Test
 
   def setup
-    @
+    @t = Tokenizer::Tokenizer.new(:de)
   end
 
   def test_constants
@@ -12,14 +14,30 @@ class TestTokenizer < Minitest::Test
   end
 
   def test_output_type
-    output = @
+    output = @t.tokenize('ich gehe in die Schule')
     assert(output.is_a?(Array))
   end
 
   def test_tokenization_001
     input = 'Ich ging in die Schule!'
     etalon = %w(Ich ging in die Schule !)
-    output = @
+    output = @t.tokenize(input)
     assert_equal(etalon, output)
   end
+
+  def test_tokenization_002
+    input = '" Es ist wirklich schwer zu sagen , welche Positionen er einnimmt , da er sich noch nicht konkret geäußert hat " , beklagen Volkswirte .'
+    etalon = %w(" Es ist wirklich schwer zu sagen , welche Positionen er einnimmt , da er sich noch nicht konkret geäußert hat " , beklagen Volkswirte .)
+    output = @t.tokenize(input)
+    assert_equal(etalon, output)
+  end
+end
+
+describe Tokenizer do
+  describe 'empty input' do
+    it 'should return an Array with an empty string' do
+      tokens = Tokenizer::Tokenizer.new.tokenize('')
+      tokens.must_equal([''])
+    end
+  end
 end
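The suite now mixes classic Minitest assertions with a minitest/spec expectation (must_equal), which is why minitest/spec is required at the top. Assuming a checkout laid out as in the metadata file list below, the regression tests can be run directly with:

    $ ruby -Ilib test/regression_tests/test_de_tokenizer.rb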
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.2.0
 platform: ruby
 authors:
 - Andrei Beliankou
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2016-01-11 00:00:00.000000000 Z
 dependencies: []
 description: A simple multilingual tokenizer for NLP tasks. This tool provides a CLI
   and a library for linguistic tokenization which is an unavoidable step for many
@@ -41,7 +41,8 @@ files:
 - test/development_tests/test_ru_tokenizer_dev.rb
 - test/regression_tests/test_de_tokenizer.rb
 homepage: https://github.com/arbox/tokenizer
-licenses:
+licenses:
+- MIT
 metadata: {}
 post_install_message:
 rdoc_options: []
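With the version, release date, and MIT license now filled in, this exact release can be pulled from RubyGems with the standard command:

    $ gem install tokenizer -v 0.2.0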