tokenizer 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.rdoc +4 -4
- data/lib/tokenizer/tokenizer.rb +10 -2
- data/lib/tokenizer/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6d33919dbe66ccc9e7bb95e6239b4da8d9442f4d
+  data.tar.gz: 4eaf400b1811648d1c22ea993a981c5169af09af
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0fcd6304e8e967364756b5e2f2c02ecdec49e7563f1311fb695d29a3aa84640ec57d79db4e4a0581eec0a5363bc8e92750649ce859ad0f8304734274f896da1d
+  data.tar.gz: 2d34cf0948e0c5fab4b76730f8adcac4b7a5ee78ec90c637de2a9de1bdad234dc95fbef264ad941657c13b6ebc9aa53c8ee7485bcc06a2c47caf697fb5e3cab0
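The release records SHA1 and SHA512 digests for both archives bundled inside the .gem container. As a side note (not part of the package itself), such values can be reproduced with Ruby's digest standard library; the sketch below assumes the .gem archive has already been unpacked (e.g. with tar -xf tokenizer-0.3.0.gem) so that metadata.gz and data.tar.gz sit in the current directory:

    # Recompute the digests recorded in checksums.yaml. The file names
    # follow the YAML keys above; the unpacking step is an assumption.
    require 'digest'

    %w[metadata.gz data.tar.gz].each do |file|
      puts "#{file}:"
      puts "  SHA1:   #{Digest::SHA1.file(file).hexdigest}"
      puts "  SHA512: #{Digest::SHA512.file(file).hexdigest}"
    end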
data/README.rdoc
CHANGED
@@ -11,7 +11,7 @@
 {<img src="https://img.shields.io/gemnasium/arbox/tokenizer.svg" alt="Dependency Status" />}[https://gemnasium.com/arbox/tokenizer]
 
 == DESCRIPTION
-A simple multilingual tokenizer -- a linguistic tool intended to split a text
+A simple multilingual tokenizer -- a linguistic tool intended to split a written text
 into tokens for NLP tasks. This tool provides a CLI and a library for
 linguistic tokenization which is an unavoidable step for many HLT (Human
 Language Technology) tasks in the preprocessing phase for further syntactic,
@@ -45,17 +45,17 @@ You can use +Tokenizer+ in two ways.
 
 * As a library for embedded tokenization:
     > require 'tokenizer'
-    > de_tokenizer = Tokenizer::
+    > de_tokenizer = Tokenizer::WhitespaceTokenizer.new
    > de_tokenizer.tokenize('Ich gehe in die Schule!')
    > => ["Ich", "gehe", "in", "die", "Schule", "!"]
 
 * Customizable PRE and POST list
    > require 'tokenizer'
-    > de_tokenizer = Tokenizer::
+    > de_tokenizer = Tokenizer::WhitespaceTokenizer.new(:de, { post: Tokenizer::Tokenizer::POST + ['|'] })
    > de_tokenizer.tokenize('Ich gehe|in die Schule!')
    > => ["Ich", "gehe", "|in", "die", "Schule", "!"]
 
-See documentation in the Tokenizer::
+See documentation in the Tokenizer::WhitespaceTokenizer class for details
 on particular methods.
 
 == SUPPORT
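Condensed into a standalone script, the updated irb examples look like this; a minimal sketch assuming the 0.3.0 gem is installed (the output comments reproduce the README's shown results):

    require 'tokenizer'

    # Plain whitespace tokenization with punctuation detection.
    de_tokenizer = Tokenizer::WhitespaceTokenizer.new
    p de_tokenizer.tokenize('Ich gehe in die Schule!')
    # => ["Ich", "gehe", "in", "die", "Schule", "!"]

    # The same tokenizer with '|' appended to the POST punctuation list.
    custom = Tokenizer::WhitespaceTokenizer.new(:de, post: Tokenizer::Tokenizer::POST + ['|'])
    p custom.tokenize('Ich gehe|in die Schule!')
    # => ["Ich", "gehe", "|in", "die", "Schule", "!"]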
data/lib/tokenizer/tokenizer.rb
CHANGED
@@ -5,7 +5,7 @@
 # A namespace for all project related stuff.
 module Tokenizer
   # Simple whitespace based tokenizer with configurable punctuation detection.
-  class
+  class WhitespaceTokenizer
     # Default whitespace separator.
     FS = Regexp.new('[[:blank:]]+')
 
@@ -64,10 +64,18 @@ module Tokenizer
 
     private
 
-    # @param [String] User defined string to be tokenized.
+    # @param [String] str User defined string to be tokenized.
     # @return [String] A new modified string.
     def sanitize_input(str)
      str.chomp.strip
    end
  end # class
+
+  # @deprecated Use {WhitespaceTokenizer} instead.
+  class Tokenizer < WhitespaceTokenizer
+    def initialize(*args)
+      warn '[Deprecated!] Use WhitespaceTokenizer instead.'
+      super(*args)
+    end
+  end
 end # module
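The new compatibility subclass keeps old call sites working while steering users toward the new name. A sketch of both spellings side by side, assuming 0.3.0 (the sample sentence and output follow the README):

    require 'tokenizer'

    # Old name: still functional, but Kernel#warn prints
    # '[Deprecated!] Use WhitespaceTokenizer instead.' to stderr first.
    legacy = Tokenizer::Tokenizer.new
    p legacy.tokenize('Ich gehe in die Schule!')
    # => ["Ich", "gehe", "in", "die", "Schule", "!"]

    # New name: identical behavior, since the deprecated class merely
    # subclasses WhitespaceTokenizer and forwards its arguments via super.
    current = Tokenizer::WhitespaceTokenizer.new
    p current.tokenize('Ich gehe in die Schule!')
    # => ["Ich", "gehe", "in", "die", "Schule", "!"]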
data/lib/tokenizer/version.rb
CHANGED
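The body of this one-line change is not reproduced in the diff view. Given the version bump recorded in the gem metadata below, it is presumably the version constant, along the lines of (hypothetical reconstruction):

    -  VERSION = '0.2.0'
    +  VERSION = '0.3.0'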
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Andrei Beliankou
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-01-
+date: 2016-01-20 00:00:00.000000000 Z
 dependencies: []
 description: A simple multilingual tokenizer for NLP tasks. This tool provides a CLI
   and a library for linguistic tokenization which is an unavoidable step for many