tokenizer 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: abb8db238956f4cbc491e75ec47f7994e572df1d
-   data.tar.gz: 4d69f5ae6fe9c6411b45946098700034c01152fe
+   metadata.gz: f536183270ebc769890d1210adfd255a4df593a1
+   data.tar.gz: 4396fc6566fe0703e326a6c135d644c071fd4f28
  SHA512:
-   metadata.gz: fcf399eb94f200fa1a682dc64193fcb3bef391e2db7eece38f1603181d139368a2036e14058b59c26b75f4643517f10c5f4508a27830561ed197bfa941fa4ad3
-   data.tar.gz: 12f59222b26ec7987f971679b8d7be32fc86468b0741afed1b8607eb3049902082d93c614ce83358e6777a61fa4326c4bd11d5ebc9f12d751eb4fb11d678a8b7
+   metadata.gz: e14294be0b8ac2a341bab0dea72196e76e82a8192432694dcca0a1456d86290f48ba332b56a19a0401b38360643734c99333ef2e70c7a2a9d1801b1c7618a9bf
+   data.tar.gz: d603d33571f3ae9b1a1721784ad5da0e531acd763811989ef34d91b453713b3558bea370a0ca0c13a5c8fef5a5d0ac0a9ab71a2e3be7ac6b47a328624df67d82
data/.yardopts CHANGED
@@ -1,8 +1,10 @@
  --private
  --protected
  --title 'A simple tokenizer for NLP tasks.'
+ --main README.rdoc
  -
-
  CHANGELOG.rdoc
+ README.rdoc
  LICENSE.rdoc
  bin/*
+ lib/**/*
data/README.rdoc CHANGED
@@ -44,16 +44,16 @@ You can use +Tokenizer+ in two ways.
    $ echo 'Hi, ich gehe in die Schule!. | tokenize

  * As a library for embedded tokenization:
-   $ require 'tokenizer'
-   $ de_tokenizer = Tokenizer::Tokenizer.new
-   $ de_tokenizer.tokenize('Ich gehe in die Schule!')
-   $ => ["Ich", "gehe", "in", "die", "Schule", "!"]
+   > require 'tokenizer'
+   > de_tokenizer = Tokenizer::Tokenizer.new
+   > de_tokenizer.tokenize('Ich gehe in die Schule!')
+   > => ["Ich", "gehe", "in", "die", "Schule", "!"]

  * Customizable PRE and POST list
-   $ require 'tokenizer'
-   $ de_tokenizer = Tokenizer::Tokenizer.new(:de, { POST: Tokenizer::Tokenizer::POST + ['|'] })
-   $ de_tokenizer.tokenize('Ich gehe|in die Schule!')
-   $ => ["Ich", "gehe", "|in", "die", "Schule", "!"]
+   > require 'tokenizer'
+   > de_tokenizer = Tokenizer::Tokenizer.new(:de, { post: Tokenizer::Tokenizer::POST + ['|'] })
+   > de_tokenizer.tokenize('Ich gehe|in die Schule!')
+   > => ["Ich", "gehe", "|in", "die", "Schule", "!"]

  See documentation in the Tokenizer::Tokenizer class for details
  on particular methods.
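The library examples above now use irb prompts (>) instead of shell prompts ($), and the customization example passes the lowercase symbol :post instead of POST:. A minimal sketch of the same calls as a plain Ruby script; note that the aggregate POST constant the README still references is removed in 0.2.0 (see the library diff further down), so the custom suffix list here is built from the remaining SIMPLE_POST and PAIR_POST constants:

  require 'tokenizer'

  de_tokenizer = Tokenizer::Tokenizer.new
  de_tokenizer.tokenize('Ich gehe in die Schule!')
  # => ["Ich", "gehe", "in", "die", "Schule", "!"]

  # Custom suffix list with the lowercase :post key (POST itself is gone in 0.2.0).
  custom_post = Tokenizer::Tokenizer::SIMPLE_POST +
                Tokenizer::Tokenizer::PAIR_POST + ['|']
  de_tokenizer = Tokenizer::Tokenizer.new(:de, post: custom_post)
  de_tokenizer.tokenize('Ich gehe|in die Schule!')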
@@ -1,10 +1,9 @@
  #!/usr/bin/env ruby
- # -*- coding: utf-8 -*-

  require 'tokenizer'

- de_tokenizer = Tokenizer::Tokenizer.new
+ tokenizer = Tokenizer::Tokenizer.new

- while record = gets
-   print de_tokenizer.tokenize(record).join("\n")
+ while (line = gets)
+   puts tokenizer.tokenize(line).join("\n")
  end
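The rewritten script reads standard input line by line and prints one token per line. A quick way to exercise it from Ruby, assuming the executable lives at bin/tokenize and the sketch is run from the gem's source checkout (both assumptions, not stated in the diff):

  require 'open3'

  # Feed one sentence on stdin and capture the token-per-line output.
  out, _status = Open3.capture2('ruby', '-Ilib', 'bin/tokenize',
                                stdin_data: "Ich gehe in die Schule!\n")
  puts out
  # Ich
  # gehe
  # in
  # die
  # Schule
  # !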
@@ -4,19 +4,34 @@

  # A namespace for all project related stuff.
  module Tokenizer
+   # Simple whitespace based tokenizer with configurable punctuation detection.
    class Tokenizer
+     # Default whitespace separator.
      FS = Regexp.new('[[:blank:]]+')

-     # spanish marks
-     SIMPLE_PRE = []
-     PAIR_PRE = ['(', '{', '[', '<']
+     # Characters only in the role of splittable prefixes.
+     SIMPLE_PRE = ['¿', '¡']
+
+     # Characters only in the role of splittable suffixes.
      SIMPLE_POST = ['!', '?', ',', ':', ';', '.']
-     PAIR_POST = [')', '}', ']', '>']
+
+     # Characters as splittable prefixes with an optional matching suffix.
+     PAIR_PRE = ['(', '{', '[', '<', '«', '„']
+
+     # Characters as splittable suffixes with an optional matching prefix.
+     PAIR_POST = [')', '}', ']', '>', '»', '“']
+
+     # Characters which can be both prefixes AND suffixes.
      PRE_N_POST = ['"', "'"]

-     PRE = SIMPLE_PRE + PAIR_PRE
-     POST = SIMPLE_POST + PAIR_POST
+     private_constant :FS

+     # @param [Symbol] lang Language identifier.
+     # @param [Hash] options Additional options.
+     # @option options [Array] :pre Array of splittable prefix characters.
+     # @option options [Array] :post Array of splittable suffix characters.
+     # @option options [Array] :pre_n_post Array of characters with
+     #   suffix AND prefix functions.
      def initialize(lang = :de, options = {})
        @lang = lang
        @options = {
@@ -26,39 +41,33 @@ module Tokenizer
        }.merge(options)
      end

+     # @param [String] str String to be tokenized.
+     # @return [Array<String>] Array of tokens.
      def tokenize(str)
-       output = ''
+       tokens = sanitize_input(str).split(FS)
+       return [''] if tokens.empty?

-       fields = str.chomp.split(FS)
+       splittables = SIMPLE_PRE + SIMPLE_POST + PAIR_PRE + PAIR_POST + PRE_N_POST
+       pattern = Regexp.new("[^#{Regexp.escape(splittables.join)}]+")
+       output = []
+       tokens.each do |token|
+         prefix, stem, suffix = token.partition(pattern)
+         output << prefix.split('') unless prefix.empty?
+         output << stem unless stem.empty?
+         output << suffix.split('') unless suffix.empty?
+       end

-       return [''] if fields.empty?
+       output.flatten
+     end

-       fields.each do |field|
-         field.each_char.with_index do |ch, idx|
-           case
-           when @options[:pre].include?(ch)
-             output << "#{ch}\n"
-           when @options[:post].include?(ch)
-             output << "\n#{ch}"
-             if ['?', '!', '.'].include?(ch)
-               output << "\n"
-             end
-           when @options[:pre_n_post].include?(ch)
-             if idx == 0
-               output << "#{ch}\n"
-             elsif idx != 0
-               output << "\n#{ch}"
-             end
-           else
-             output << ch
-           end
-         end
+     alias process tokenize

-         output << "\n"
-       end
+     private

-       # @TODO: Rework the format of the string!
-       output.chomp('').split("\n", -1)
+     # @param [String] User defined string to be tokenized.
+     # @return [String] A new modified string.
+     def sanitize_input(str)
+       str.chomp.strip
      end
    end # class
  end # module
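The new tokenize splits on whitespace and then partitions every token into splittable prefix characters, a stem, and splittable suffix characters, driven by the class constants rather than by per-character case analysis. A short sketch (not part of the diff) exercising the newly added Spanish and German punctuation; the expected arrays follow from the implementation shown above:

  require 'tokenizer'

  t = Tokenizer::Tokenizer.new(:de)

  # SIMPLE_PRE now covers the inverted Spanish marks.
  t.tokenize('¿Vamos a la escuela?')
  # => ["¿", "Vamos", "a", "la", "escuela", "?"]

  # PAIR_PRE and PAIR_POST now cover the German quotes „ and “.
  t.tokenize('Er sagte: „Ich gehe in die Schule!“')
  # => ["Er", "sagte", ":", "„", "Ich", "gehe", "in", "die", "Schule", "!", "“"]

  # process is an alias for tokenize.
  t.process('Hallo, Welt!')
  # => ["Hallo", ",", "Welt", "!"]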
@@ -1,3 +1,3 @@
  module Tokenizer
-   VERSION = '0.1.2'
+   VERSION = '0.2.0'
  end
data/test/regression_tests/test_de_tokenizer.rb CHANGED
@@ -1,10 +1,12 @@
+ # coding: utf-8
  require 'minitest/autorun'
+ require 'minitest/spec'
  require 'tokenizer'

  class TestTokenizer < Minitest::Test

    def setup
-     @de_tokenizer = Tokenizer::Tokenizer.new(:de)
+     @t = Tokenizer::Tokenizer.new(:de)
    end

    def test_constants
@@ -12,14 +14,30 @@ class TestTokenizer < Minitest::Test
    end

    def test_output_type
-     output = @de_tokenizer.tokenize('ich gehe in die Schule')
+     output = @t.tokenize('ich gehe in die Schule')
      assert(output.is_a?(Array))
    end

    def test_tokenization_001
      input = 'Ich ging in die Schule!'
      etalon = %w(Ich ging in die Schule !)
-     output = @de_tokenizer.tokenize(input)
+     output = @t.tokenize(input)
      assert_equal(etalon, output)
    end
+
+   def test_tokenization_002
+     input = '" Es ist wirklich schwer zu sagen , welche Positionen er einnimmt , da er sich noch nicht konkret geäußert hat " , beklagen Volkswirte .'
+     etalon = %w(" Es ist wirklich schwer zu sagen , welche Positionen er einnimmt , da er sich noch nicht konkret geäußert hat " , beklagen Volkswirte .)
+     output = @t.tokenize(input)
+     assert_equal(etalon, output)
+   end
+ end
+
+ describe Tokenizer do
+   describe 'empty input' do
+     it 'should return an Array with an empty string' do
+       tokens = Tokenizer::Tokenizer.new.tokenize('')
+       tokens.must_equal([''])
+     end
+   end
  end
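The added spec block pins down the empty-input contract using describe/it and must_equal expectations, which the newly required minitest/spec provides. A quick check of the same behaviour outside the test suite:

  require 'tokenizer'

  # Empty input yields an Array containing a single empty string.
  Tokenizer::Tokenizer.new.tokenize('')
  # => [""]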
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: tokenizer
  version: !ruby/object:Gem::Version
-   version: 0.1.2
+   version: 0.2.0
  platform: ruby
  authors:
  - Andrei Beliankou
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-09-03 00:00:00.000000000 Z
+ date: 2016-01-11 00:00:00.000000000 Z
  dependencies: []
  description: A simple multilingual tokenizer for NLP tasks. This tool provides a CLI
    and a library for linguistic tokenization which is an anavoidable step for many
@@ -41,7 +41,8 @@ files:
  - test/development_tests/test_ru_tokenizer_dev.rb
  - test/regression_tests/test_de_tokenizer.rb
  homepage: https://github.com/arbox/tokenizer
- licenses: []
+ licenses:
+ - MIT
  metadata: {}
  post_install_message:
  rdoc_options: []