tokenizer 0.2.0 → 0.3.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: f536183270ebc769890d1210adfd255a4df593a1
- data.tar.gz: 4396fc6566fe0703e326a6c135d644c071fd4f28
+ metadata.gz: 6d33919dbe66ccc9e7bb95e6239b4da8d9442f4d
+ data.tar.gz: 4eaf400b1811648d1c22ea993a981c5169af09af
  SHA512:
- metadata.gz: e14294be0b8ac2a341bab0dea72196e76e82a8192432694dcca0a1456d86290f48ba332b56a19a0401b38360643734c99333ef2e70c7a2a9d1801b1c7618a9bf
- data.tar.gz: d603d33571f3ae9b1a1721784ad5da0e531acd763811989ef34d91b453713b3558bea370a0ca0c13a5c8fef5a5d0ac0a9ab71a2e3be7ac6b47a328624df67d82
+ metadata.gz: 0fcd6304e8e967364756b5e2f2c02ecdec49e7563f1311fb695d29a3aa84640ec57d79db4e4a0581eec0a5363bc8e92750649ce859ad0f8304734274f896da1d
+ data.tar.gz: 2d34cf0948e0c5fab4b76730f8adcac4b7a5ee78ec90c637de2a9de1bdad234dc95fbef264ad941657c13b6ebc9aa53c8ee7485bcc06a2c47caf697fb5e3cab0

@@ -11,7 +11,7 @@
  {<img src="https://img.shields.io/gemnasium/arbox/tokenizer.svg" alt="Dependency Status" />}[https://gemnasium.com/arbox/tokenizer]

  == DESCRIPTION
- A simple multilingual tokenizer -- a linguistic tool intended to split a text
+ A simple multilingual tokenizer -- a linguistic tool intended to split a written text
  into tokens for NLP tasks. This tool provides a CLI and a library for
  linguistic tokenization which is an unavoidable step for many HLT (Human
  Language Technology) tasks in the preprocessing phase for further syntactic,

@@ -45,17 +45,17 @@ You can use +Tokenizer+ in two ways.

  * As a library for embedded tokenization:
  > require 'tokenizer'
- > de_tokenizer = Tokenizer::Tokenizer.new
+ > de_tokenizer = Tokenizer::WhitespaceTokenizer.new
  > de_tokenizer.tokenize('Ich gehe in die Schule!')
  > => ["Ich", "gehe", "in", "die", "Schule", "!"]

  * Customizable PRE and POST list
  > require 'tokenizer'
- > de_tokenizer = Tokenizer::Tokenizer.new(:de, { post: Tokenizer::Tokenizer::POST + ['|'] })
+ > de_tokenizer = Tokenizer::WhitespaceTokenizer.new(:de, { post: Tokenizer::Tokenizer::POST + ['|'] })
  > de_tokenizer.tokenize('Ich gehe|in die Schule!')
  > => ["Ich", "gehe", "|in", "die", "Schule", "!"]

- See documentation in the Tokenizer::Tokenizer class for details
+ See documentation in the Tokenizer::WhitespaceTokenizer class for details
  on particular methods.

  == SUPPORT

@@ -5,7 +5,7 @@
  # A namespace for all project related stuff.
  module Tokenizer
  # Simple whitespace based tokenizer with configurable punctuation detection.
- class Tokenizer
+ class WhitespaceTokenizer
  # Default whitespace separator.
  FS = Regexp.new('[[:blank:]]+')

@@ -64,10 +64,18 @@ module Tokenizer

  private

- # @param [String] User defined string to be tokenized.
+ # @param [String] str User defined string to be tokenized.
  # @return [String] A new modified string.
  def sanitize_input(str)
  str.chomp.strip
  end
  end # class
+
+ # @deprecated Use {WhitespaceTokenizer} instead.
+ class Tokenizer < WhitespaceTokenizer
+ def initialize(*args)
+ warn '[Deprecated!] Use WhitespaceTokenizer instead.'
+ super(*args)
+ end
+ end
  end # module
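
The hunk above renames the core class to Tokenizer::WhitespaceTokenizer and keeps the old Tokenizer::Tokenizer constant as a thin deprecation shim. A minimal sketch of what callers see after upgrading to 0.3.0, based only on the code shown in this diff (variable names are illustrative):

  require 'tokenizer'

  # New, preferred entry point in 0.3.0.
  tokenizer = Tokenizer::WhitespaceTokenizer.new
  tokenizer.tokenize('Ich gehe in die Schule!')
  # => ["Ich", "gehe", "in", "die", "Schule", "!"]

  # The legacy name still works because Tokenizer::Tokenizer now subclasses
  # WhitespaceTokenizer, but its initializer prints
  # "[Deprecated!] Use WhitespaceTokenizer instead." to standard error.
  legacy = Tokenizer::Tokenizer.new
  legacy.tokenize('Ich gehe in die Schule!')
  # => ["Ich", "gehe", "in", "die", "Schule", "!"]
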
@@ -1,3 +1,3 @@
  module Tokenizer
- VERSION = '0.2.0'
+ VERSION = '0.3.0'
  end
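
The version constant is the only change in this hunk. A quick runtime check, assuming the gem's top-level require also loads the file defining Tokenizer::VERSION (that require is not part of this diff):

  require 'tokenizer'

  # Assumption: requiring 'tokenizer' makes Tokenizer::VERSION available.
  puts Tokenizer::VERSION  # expected to print 0.3.0 after the upgrade
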
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: tokenizer
  version: !ruby/object:Gem::Version
- version: 0.2.0
+ version: 0.3.0
  platform: ruby
  authors:
  - Andrei Beliankou
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-01-11 00:00:00.000000000 Z
+ date: 2016-01-20 00:00:00.000000000 Z
  dependencies: []
  description: A simple multilingual tokenizer for NLP tasks. This tool provides a CLI
  and a library for linguistic tokenization which is an unavoidable step for many