tokenizer 0.1.2 → 0.2.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: abb8db238956f4cbc491e75ec47f7994e572df1d
- data.tar.gz: 4d69f5ae6fe9c6411b45946098700034c01152fe
+ metadata.gz: f536183270ebc769890d1210adfd255a4df593a1
+ data.tar.gz: 4396fc6566fe0703e326a6c135d644c071fd4f28
  SHA512:
- metadata.gz: fcf399eb94f200fa1a682dc64193fcb3bef391e2db7eece38f1603181d139368a2036e14058b59c26b75f4643517f10c5f4508a27830561ed197bfa941fa4ad3
- data.tar.gz: 12f59222b26ec7987f971679b8d7be32fc86468b0741afed1b8607eb3049902082d93c614ce83358e6777a61fa4326c4bd11d5ebc9f12d751eb4fb11d678a8b7
+ metadata.gz: e14294be0b8ac2a341bab0dea72196e76e82a8192432694dcca0a1456d86290f48ba332b56a19a0401b38360643734c99333ef2e70c7a2a9d1801b1c7618a9bf
+ data.tar.gz: d603d33571f3ae9b1a1721784ad5da0e531acd763811989ef34d91b453713b3558bea370a0ca0c13a5c8fef5a5d0ac0a9ab71a2e3be7ac6b47a328624df67d82
data/.yardopts CHANGED
@@ -1,8 +1,10 @@
  --private
  --protected
  --title 'A simple tokenizer for NLP tasks.'
+ --main README.rdoc
  -
-
  CHANGELOG.rdoc
+ README.rdoc
  LICENSE.rdoc
  bin/*
+ lib/**/*
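The two new flags change what YARD generates: --main README.rdoc makes the README the landing page, and lib/**/* adds the library sources to the documented files. To rebuild the docs locally (assuming the yard gem is installed; yard reads .yardopts automatically):

  $ gem install yard
  $ yard doc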
data/README.rdoc CHANGED
@@ -44,16 +44,16 @@ You can use +Tokenizer+ in two ways.
  $ echo 'Hi, ich gehe in die Schule!' | tokenize

  * As a library for embedded tokenization:
- $ require 'tokenizer'
- $ de_tokenizer = Tokenizer::Tokenizer.new
- $ de_tokenizer.tokenize('Ich gehe in die Schule!')
- $ => ["Ich", "gehe", "in", "die", "Schule", "!"]
+ > require 'tokenizer'
+ > de_tokenizer = Tokenizer::Tokenizer.new
+ > de_tokenizer.tokenize('Ich gehe in die Schule!')
+ > => ["Ich", "gehe", "in", "die", "Schule", "!"]

  * Customizable PRE and POST list
- $ require 'tokenizer'
- $ de_tokenizer = Tokenizer::Tokenizer.new(:de, { POST: Tokenizer::Tokenizer::POST + ['|'] })
- $ de_tokenizer.tokenize('Ich gehe|in die Schule!')
- $ => ["Ich", "gehe", "|in", "die", "Schule", "!"]
+ > require 'tokenizer'
+ > de_tokenizer = Tokenizer::Tokenizer.new(:de, { post: Tokenizer::Tokenizer::POST + ['|'] })
+ > de_tokenizer.tokenize('Ich gehe|in die Schule!')
+ > => ["Ich", "gehe", "|in", "die", "Schule", "!"]

  See documentation in the Tokenizer::Tokenizer class for details
  on particular methods.
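For reference, the updated session corresponds to this standalone script (a minimal sketch; note that this release also removes the aggregate POST constant used in the customization example above, so only the basic call is reproduced):

  require 'tokenizer'

  de_tokenizer = Tokenizer::Tokenizer.new(:de)
  p de_tokenizer.tokenize('Ich gehe in die Schule!')
  # => ["Ich", "gehe", "in", "die", "Schule", "!"]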
data/bin/tokenize CHANGED
@@ -1,10 +1,9 @@
  #!/usr/bin/env ruby
- # -*- coding: utf-8 -*-

  require 'tokenizer'

- de_tokenizer = Tokenizer::Tokenizer.new
+ tokenizer = Tokenizer::Tokenizer.new

- while record = gets
-   print de_tokenizer.tokenize(record).join("\n")
+ while (line = gets)
+   puts tokenizer.tokenize(line).join("\n")
  end
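Besides the renamed variables, switching from print to puts fixes a subtle bug: join("\n") leaves no trailing newline, so under print the last token of one input line ran together with the first token of the next. Expected behaviour now, one token per line:

  $ echo 'Ich gehe in die Schule!' | tokenize
  Ich
  gehe
  in
  die
  Schule
  !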
data/lib/tokenizer/tokenizer.rb CHANGED
@@ -4,19 +4,34 @@

  # A namespace for all project related stuff.
  module Tokenizer
+   # Simple whitespace based tokenizer with configurable punctuation detection.
    class Tokenizer
+     # Default whitespace separator.
      FS = Regexp.new('[[:blank:]]+')

-     # spanish marks
-     SIMPLE_PRE = []
-     PAIR_PRE = ['(', '{', '[', '<']
+     # Characters only in the role of splittable prefixes.
+     SIMPLE_PRE = ['¿', '¡']
+
+     # Characters only in the role of splittable suffixes.
      SIMPLE_POST = ['!', '?', ',', ':', ';', '.']
-     PAIR_POST = [')', '}', ']', '>']
+
+     # Characters as splittable prefixes with an optional matching suffix.
+     PAIR_PRE = ['(', '{', '[', '<', '«', '„']
+
+     # Characters as splittable suffixes with an optional matching prefix.
+     PAIR_POST = [')', '}', ']', '>', '»', '“']
+
+     # Characters which can be both prefixes AND suffixes.
      PRE_N_POST = ['"', "'"]

-     PRE = SIMPLE_PRE + PAIR_PRE
-     POST = SIMPLE_POST + PAIR_POST
+     private_constant :FS

+     # @param [Symbol] lang Language identifier.
+     # @param [Hash] options Additional options.
+     # @option options [Array] :pre Array of splittable prefix characters.
+     # @option options [Array] :post Array of splittable suffix characters.
+     # @option options [Array] :pre_n_post Array of characters with
+     #   suffix AND prefix functions.
      def initialize(lang = :de, options = {})
        @lang = lang
        @options = {
@@ -26,39 +41,33 @@ module Tokenizer
        }.merge(options)
      end

+     # @param [String] str String to be tokenized.
+     # @return [Array<String>] Array of tokens.
      def tokenize(str)
-       output = ''
+       tokens = sanitize_input(str).split(FS)
+       return [''] if tokens.empty?

-       fields = str.chomp.split(FS)
+       splittables = SIMPLE_PRE + SIMPLE_POST + PAIR_PRE + PAIR_POST + PRE_N_POST
+       pattern = Regexp.new("[^#{Regexp.escape(splittables.join)}]+")
+       output = []
+       tokens.each do |token|
+         prefix, stem, suffix = token.partition(pattern)
+         output << prefix.split('') unless prefix.empty?
+         output << stem unless stem.empty?
+         output << suffix.split('') unless suffix.empty?
+       end

-       return [''] if fields.empty?
+       output.flatten
+     end

-       fields.each do |field|
-         field.each_char.with_index do |ch, idx|
-           case
-           when @options[:pre].include?(ch)
-             output << "#{ch}\n"
-           when @options[:post].include?(ch)
-             output << "\n#{ch}"
-             if ['?', '!', '.'].include?(ch)
-               output << "\n"
-             end
-           when @options[:pre_n_post].include?(ch)
-             if idx == 0
-               output << "#{ch}\n"
-             elsif idx != 0
-               output << "\n#{ch}"
-             end
-           else
-             output << ch
-           end
-         end
+     alias process tokenize

-         output << "\n"
-       end
+     private

-       # @TODO: Rework the format of the string!
-       output.chomp('').split("\n", -1)
+     # @param [String] str User defined string to be tokenized.
+     # @return [String] A new modified string.
+     def sanitize_input(str)
+       str.chomp.strip
      end
    end # class
  end # module
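The rewritten tokenize drops the per-character case statement in favour of one String#partition call per whitespace-separated token: the first run of non-splittable characters becomes the stem, and whatever surrounds it is split into single-character tokens. The core step in isolation (a standalone sketch with a shortened splittables list):

  splittables = ['!', '?', ',', '(', ')', '"']
  pattern = Regexp.new("[^#{Regexp.escape(splittables.join)}]+")

  prefix, stem, suffix = '"Schule!"'.partition(pattern)
  p prefix.split('')  # => ["\""]
  p stem              # => "Schule"
  p suffix.split('')  # => ["!", "\""]

Note that the new implementation consults only the class constants, so the :pre/:post/:pre_n_post options documented above are accepted by initialize but not yet used by tokenize.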
data/lib/tokenizer/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Tokenizer
-   VERSION = '0.1.2'
+   VERSION = '0.2.0'
  end
data/test/regression_tests/test_de_tokenizer.rb CHANGED
@@ -1,10 +1,12 @@
+ # coding: utf-8
  require 'minitest/autorun'
+ require 'minitest/spec'
  require 'tokenizer'

  class TestTokenizer < Minitest::Test

    def setup
-     @de_tokenizer = Tokenizer::Tokenizer.new(:de)
+     @t = Tokenizer::Tokenizer.new(:de)
    end

    def test_constants
@@ -12,14 +14,30 @@ class TestTokenizer < Minitest::Test
    end

    def test_output_type
-     output = @de_tokenizer.tokenize('ich gehe in die Schule')
+     output = @t.tokenize('ich gehe in die Schule')
      assert(output.is_a?(Array))
    end

    def test_tokenization_001
      input = 'Ich ging in die Schule!'
      etalon = %w(Ich ging in die Schule !)
-     output = @de_tokenizer.tokenize(input)
+     output = @t.tokenize(input)
      assert_equal(etalon, output)
    end
+
+   def test_tokenization_002
+     input = '" Es ist wirklich schwer zu sagen , welche Positionen er einnimmt , da er sich noch nicht konkret geäußert hat " , beklagen Volkswirte .'
+     etalon = %w(" Es ist wirklich schwer zu sagen , welche Positionen er einnimmt , da er sich noch nicht konkret geäußert hat " , beklagen Volkswirte .)
+     output = @t.tokenize(input)
+     assert_equal(etalon, output)
+   end
+ end
+
+ describe Tokenizer do
+   describe 'empty input' do
+     it 'should return an Array with an empty string' do
+       tokens = Tokenizer::Tokenizer.new.tokenize('')
+       tokens.must_equal([''])
+     end
+   end
  end
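The new spec pins down the early return added to tokenize; for empty input the data flow is (traced by hand):

  ''.chomp.strip            # => ""
  ''.split(/[[:blank:]]+/)  # => []
  # tokens.empty? is true, so tokenize returns ['']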
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: tokenizer
  version: !ruby/object:Gem::Version
-   version: 0.1.2
+   version: 0.2.0
  platform: ruby
  authors:
  - Andrei Beliankou
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-09-03 00:00:00.000000000 Z
+ date: 2016-01-11 00:00:00.000000000 Z
  dependencies: []
  description: A simple multilingual tokenizer for NLP tasks. This tool provides a CLI
    and a library for linguistic tokenization which is an unavoidable step for many
@@ -41,7 +41,8 @@ files:
  - test/development_tests/test_ru_tokenizer_dev.rb
  - test/regression_tests/test_de_tokenizer.rb
  homepage: https://github.com/arbox/tokenizer
- licenses: []
+ licenses:
+ - MIT
  metadata: {}
  post_install_message:
  rdoc_options: []