textoken 1.1.0 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/textoken/options/exclude.rb +3 -2
- data/lib/textoken/options/less_than.rb +5 -6
- data/lib/textoken/options/modules/conditional_option.rb +4 -0
- data/lib/textoken/options/modules/numeric_option.rb +7 -3
- data/lib/textoken/options/modules/tokenizable_option.rb +18 -0
- data/lib/textoken/options/more_than.rb +5 -6
- data/lib/textoken/options/only.rb +3 -2
- data/lib/textoken/version.rb +1 -1
- data/lib/textoken.rb +1 -0
- metadata +7 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e530e61af20068c0e361856680b8df0ce9ace2e9
|
4
|
+
data.tar.gz: a94c443c5db50d0270916e8966c212e0321e8579
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 923e3261959c09b20f497f6b5d24a0495220265a23e23cd583f55db7d4eba65f5fe441374d4d11d56fc4a3e5856bb5baeda83c3c0f4994b522659121e207bd4e
|
7
|
+
data.tar.gz: 6ccac85435c7192ee2c83bfbc49d2efcff4e8fb695a4e78af5e35d7083ccc4ac4d13930ab9766d906fe41b98f4a50552a9a2aad62fe76a8f66e7cd925bfb6603
|
@@ -4,11 +4,12 @@ module Textoken
|
|
4
4
|
class Exclude
|
5
5
|
include ConditionalOption
|
6
6
|
|
7
|
+
private
|
8
|
+
|
7
9
|
# base.text is raw tokens splitted with ' '
|
8
10
|
# values are Regexps array to search
|
9
11
|
# base.findings, Findings object for pushing matching tokens
|
10
|
-
def tokenize(base)
|
11
|
-
@base = base
|
12
|
+
def tokenize_condition
|
12
13
|
tokenize_if { |word, regexp| !word.match(regexp) }
|
13
14
|
end
|
14
15
|
end
|
@@ -4,15 +4,14 @@ module Textoken
|
|
4
4
|
class LessThan
|
5
5
|
include NumericOption
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
private
|
8
|
+
|
9
|
+
def tokenize_condition
|
9
10
|
tokenize_if { |word| word.length < number }
|
10
11
|
end
|
11
12
|
|
12
|
-
|
13
|
-
|
14
|
-
def validate_option_value(value)
|
15
|
-
validate { value.class == Fixnum && value > 1 }
|
13
|
+
def validate_option_value
|
14
|
+
validate { |value| value > 1 }
|
16
15
|
end
|
17
16
|
end
|
18
17
|
end
|
@@ -1,6 +1,8 @@
|
|
1
1
|
module Textoken
|
2
2
|
# This module will be shared in options like, only and exclude
|
3
3
|
module ConditionalOption
|
4
|
+
include TokenizableOption
|
5
|
+
|
4
6
|
attr_reader :regexps, :findings, :base
|
5
7
|
|
6
8
|
def priority
|
@@ -12,6 +14,8 @@ module Textoken
|
|
12
14
|
@findings = Findings.new
|
13
15
|
end
|
14
16
|
|
17
|
+
private
|
18
|
+
|
15
19
|
def tokenize_if(&block)
|
16
20
|
regexps.each do |r|
|
17
21
|
base.text.each_with_index do |w, i|
|
@@ -1,18 +1,22 @@
|
|
1
1
|
module Textoken
|
2
2
|
# This module will be shared in options like, more_than and less_than
|
3
3
|
module NumericOption
|
4
|
-
|
4
|
+
include TokenizableOption
|
5
|
+
|
6
|
+
attr_reader :number, :findings
|
5
7
|
|
6
8
|
def priority
|
7
9
|
2
|
8
10
|
end
|
9
11
|
|
10
12
|
def initialize(value)
|
11
|
-
validate_option_value(value)
|
12
13
|
@number = value
|
13
14
|
@findings = Findings.new
|
15
|
+
validate_option_value
|
14
16
|
end
|
15
17
|
|
18
|
+
private
|
19
|
+
|
16
20
|
def tokenize_if(&code)
|
17
21
|
base.text.each_with_index do |w, i|
|
18
22
|
findings.push(i, w) if code.call(w)
|
@@ -21,7 +25,7 @@ module Textoken
|
|
21
25
|
end
|
22
26
|
|
23
27
|
def validate(&code)
|
24
|
-
return if code.call
|
28
|
+
return if number.class == Fixnum && code.call(number)
|
25
29
|
Textoken.expression_err "value #{number} is not permitted for
|
26
30
|
#{self.class.name} option."
|
27
31
|
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Textoken
|
2
|
+
# This module will be shared in options like, only_regexp and exclude_regexp
|
3
|
+
module TokenizableOption
|
4
|
+
attr_reader :base
|
5
|
+
|
6
|
+
def tokenize(base)
|
7
|
+
@base = base
|
8
|
+
tokenize_condition
|
9
|
+
end
|
10
|
+
|
11
|
+
private
|
12
|
+
|
13
|
+
def tokenize_condition
|
14
|
+
Textoken.type_err('tokenize_condition method has to be implemented
|
15
|
+
for Options.')
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
@@ -4,15 +4,14 @@ module Textoken
|
|
4
4
|
class MoreThan
|
5
5
|
include NumericOption
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
private
|
8
|
+
|
9
|
+
def tokenize_condition
|
9
10
|
tokenize_if { |word| word.length > number }
|
10
11
|
end
|
11
12
|
|
12
|
-
|
13
|
-
|
14
|
-
def validate_option_value(value)
|
15
|
-
validate { value.class == Fixnum && value >= 0 }
|
13
|
+
def validate_option_value
|
14
|
+
validate { |value| value >= 0 }
|
16
15
|
end
|
17
16
|
end
|
18
17
|
end
|
@@ -4,11 +4,12 @@ module Textoken
|
|
4
4
|
class Only
|
5
5
|
include ConditionalOption
|
6
6
|
|
7
|
+
private
|
8
|
+
|
7
9
|
# base.text is raw tokens splitted with ' '
|
8
10
|
# values are Regexps array to search
|
9
11
|
# base.findings, Findings object for pushing matching tokens
|
10
|
-
def tokenize(base)
|
11
|
-
@base = base
|
12
|
+
def tokenize_condition
|
12
13
|
tokenize_if { |word, regexp| word.match(regexp) }
|
13
14
|
end
|
14
15
|
end
|
data/lib/textoken/version.rb
CHANGED
data/lib/textoken.rb
CHANGED
@@ -9,6 +9,7 @@ require 'textoken/findings'
|
|
9
9
|
require 'textoken/tokenizer'
|
10
10
|
require 'textoken/scanner'
|
11
11
|
|
12
|
+
require 'textoken/options/modules/tokenizable_option'
|
12
13
|
require 'textoken/options/modules/numeric_option'
|
13
14
|
require 'textoken/options/modules/conditional_option'
|
14
15
|
require 'textoken/options/modules/regexp_option'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textoken
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 1.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mehmet Cetin
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-10-
|
11
|
+
date: 2015-10-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -44,23 +44,9 @@ dependencies:
|
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
46
|
version: '10.0'
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
requirements:
|
51
|
-
- - "~>"
|
52
|
-
- !ruby/object:Gem::Version
|
53
|
-
version: '0'
|
54
|
-
type: :development
|
55
|
-
prerelease: false
|
56
|
-
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
requirements:
|
58
|
-
- - "~>"
|
59
|
-
- !ruby/object:Gem::Version
|
60
|
-
version: '0'
|
61
|
-
description: "Textoken is a Ruby library for text tokenization. \n This gem extracts
|
62
|
-
words from text with many customizations. \n It can be used in many fields like
|
63
|
-
crawling and Natural Language Processing."
|
47
|
+
description: Textoken is a Ruby library for text tokenization. This gem extracts words
|
48
|
+
from text with many customizations. It can be used in many fields like Web Crawling
|
49
|
+
and Natural Language Processing.
|
64
50
|
email:
|
65
51
|
- mcetin.cm@gmail.com
|
66
52
|
executables: []
|
@@ -80,6 +66,7 @@ files:
|
|
80
66
|
- lib/textoken/options/modules/conditional_option.rb
|
81
67
|
- lib/textoken/options/modules/numeric_option.rb
|
82
68
|
- lib/textoken/options/modules/regexp_option.rb
|
69
|
+
- lib/textoken/options/modules/tokenizable_option.rb
|
83
70
|
- lib/textoken/options/more_than.rb
|
84
71
|
- lib/textoken/options/only.rb
|
85
72
|
- lib/textoken/options/only_regexp.rb
|
@@ -108,7 +95,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
108
95
|
version: '0'
|
109
96
|
requirements: []
|
110
97
|
rubyforge_project:
|
111
|
-
rubygems_version: 2.4.
|
98
|
+
rubygems_version: 2.4.8
|
112
99
|
signing_key:
|
113
100
|
specification_version: 4
|
114
101
|
summary: Simple and customizable text tokenization gem.
|