textoken 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/textoken/options/exclude.rb +3 -2
- data/lib/textoken/options/less_than.rb +5 -6
- data/lib/textoken/options/modules/conditional_option.rb +4 -0
- data/lib/textoken/options/modules/numeric_option.rb +7 -3
- data/lib/textoken/options/modules/tokenizable_option.rb +18 -0
- data/lib/textoken/options/more_than.rb +5 -6
- data/lib/textoken/options/only.rb +3 -2
- data/lib/textoken/version.rb +1 -1
- data/lib/textoken.rb +1 -0
- metadata +7 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e530e61af20068c0e361856680b8df0ce9ace2e9
|
4
|
+
data.tar.gz: a94c443c5db50d0270916e8966c212e0321e8579
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 923e3261959c09b20f497f6b5d24a0495220265a23e23cd583f55db7d4eba65f5fe441374d4d11d56fc4a3e5856bb5baeda83c3c0f4994b522659121e207bd4e
|
7
|
+
data.tar.gz: 6ccac85435c7192ee2c83bfbc49d2efcff4e8fb695a4e78af5e35d7083ccc4ac4d13930ab9766d906fe41b98f4a50552a9a2aad62fe76a8f66e7cd925bfb6603
|
@@ -4,11 +4,12 @@ module Textoken
|
|
4
4
|
class Exclude
|
5
5
|
include ConditionalOption
|
6
6
|
|
7
|
+
private
|
8
|
+
|
7
9
|
# base.text is raw tokens splitted with ' '
|
8
10
|
# values are Regexps array to search
|
9
11
|
# base.findings, Findings object for pushing matching tokens
|
10
|
-
def tokenize(base)
|
11
|
-
@base = base
|
12
|
+
def tokenize_condition
|
12
13
|
tokenize_if { |word, regexp| !word.match(regexp) }
|
13
14
|
end
|
14
15
|
end
|
@@ -4,15 +4,14 @@ module Textoken
|
|
4
4
|
class LessThan
|
5
5
|
include NumericOption
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
private
|
8
|
+
|
9
|
+
def tokenize_condition
|
9
10
|
tokenize_if { |word| word.length < number }
|
10
11
|
end
|
11
12
|
|
12
|
-
|
13
|
-
|
14
|
-
def validate_option_value(value)
|
15
|
-
validate { value.class == Fixnum && value > 1 }
|
13
|
+
def validate_option_value
|
14
|
+
validate { |value| value > 1 }
|
16
15
|
end
|
17
16
|
end
|
18
17
|
end
|
@@ -1,6 +1,8 @@
|
|
1
1
|
module Textoken
|
2
2
|
# This module will be shared in options like, only and exclude
|
3
3
|
module ConditionalOption
|
4
|
+
include TokenizableOption
|
5
|
+
|
4
6
|
attr_reader :regexps, :findings, :base
|
5
7
|
|
6
8
|
def priority
|
@@ -12,6 +14,8 @@ module Textoken
|
|
12
14
|
@findings = Findings.new
|
13
15
|
end
|
14
16
|
|
17
|
+
private
|
18
|
+
|
15
19
|
def tokenize_if(&block)
|
16
20
|
regexps.each do |r|
|
17
21
|
base.text.each_with_index do |w, i|
|
@@ -1,18 +1,22 @@
|
|
1
1
|
module Textoken
|
2
2
|
# This module will be shared in options like, more_than and less_than
|
3
3
|
module NumericOption
|
4
|
-
|
4
|
+
include TokenizableOption
|
5
|
+
|
6
|
+
attr_reader :number, :findings
|
5
7
|
|
6
8
|
def priority
|
7
9
|
2
|
8
10
|
end
|
9
11
|
|
10
12
|
def initialize(value)
|
11
|
-
validate_option_value(value)
|
12
13
|
@number = value
|
13
14
|
@findings = Findings.new
|
15
|
+
validate_option_value
|
14
16
|
end
|
15
17
|
|
18
|
+
private
|
19
|
+
|
16
20
|
def tokenize_if(&code)
|
17
21
|
base.text.each_with_index do |w, i|
|
18
22
|
findings.push(i, w) if code.call(w)
|
@@ -21,7 +25,7 @@ module Textoken
|
|
21
25
|
end
|
22
26
|
|
23
27
|
def validate(&code)
|
24
|
-
return if code.call
|
28
|
+
return if number.class == Fixnum && code.call(number)
|
25
29
|
Textoken.expression_err "value #{number} is not permitted for
|
26
30
|
#{self.class.name} option."
|
27
31
|
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Textoken
|
2
|
+
# This module will be shared in options like, only_regexp and exclude_regexp
|
3
|
+
module TokenizableOption
|
4
|
+
attr_reader :base
|
5
|
+
|
6
|
+
def tokenize(base)
|
7
|
+
@base = base
|
8
|
+
tokenize_condition
|
9
|
+
end
|
10
|
+
|
11
|
+
private
|
12
|
+
|
13
|
+
def tokenize_condition
|
14
|
+
Textoken.type_err('tokenize_condition method has to be implemented
|
15
|
+
for Options.')
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
@@ -4,15 +4,14 @@ module Textoken
|
|
4
4
|
class MoreThan
|
5
5
|
include NumericOption
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
private
|
8
|
+
|
9
|
+
def tokenize_condition
|
9
10
|
tokenize_if { |word| word.length > number }
|
10
11
|
end
|
11
12
|
|
12
|
-
|
13
|
-
|
14
|
-
def validate_option_value(value)
|
15
|
-
validate { value.class == Fixnum && value >= 0 }
|
13
|
+
def validate_option_value
|
14
|
+
validate { |value| value >= 0 }
|
16
15
|
end
|
17
16
|
end
|
18
17
|
end
|
@@ -4,11 +4,12 @@ module Textoken
|
|
4
4
|
class Only
|
5
5
|
include ConditionalOption
|
6
6
|
|
7
|
+
private
|
8
|
+
|
7
9
|
# base.text is raw tokens splitted with ' '
|
8
10
|
# values are Regexps array to search
|
9
11
|
# base.findings, Findings object for pushing matching tokens
|
10
|
-
def tokenize(base)
|
11
|
-
@base = base
|
12
|
+
def tokenize_condition
|
12
13
|
tokenize_if { |word, regexp| word.match(regexp) }
|
13
14
|
end
|
14
15
|
end
|
data/lib/textoken/version.rb
CHANGED
data/lib/textoken.rb
CHANGED
@@ -9,6 +9,7 @@ require 'textoken/findings'
|
|
9
9
|
require 'textoken/tokenizer'
|
10
10
|
require 'textoken/scanner'
|
11
11
|
|
12
|
+
require 'textoken/options/modules/tokenizable_option'
|
12
13
|
require 'textoken/options/modules/numeric_option'
|
13
14
|
require 'textoken/options/modules/conditional_option'
|
14
15
|
require 'textoken/options/modules/regexp_option'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textoken
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 1.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mehmet Cetin
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-10-
|
11
|
+
date: 2015-10-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -44,23 +44,9 @@ dependencies:
|
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
46
|
version: '10.0'
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
requirements:
|
51
|
-
- - "~>"
|
52
|
-
- !ruby/object:Gem::Version
|
53
|
-
version: '0'
|
54
|
-
type: :development
|
55
|
-
prerelease: false
|
56
|
-
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
requirements:
|
58
|
-
- - "~>"
|
59
|
-
- !ruby/object:Gem::Version
|
60
|
-
version: '0'
|
61
|
-
description: "Textoken is a Ruby library for text tokenization. \n This gem extracts
|
62
|
-
words from text with many customizations. \n It can be used in many fields like
|
63
|
-
crawling and Natural Language Processing."
|
47
|
+
description: Textoken is a Ruby library for text tokenization. This gem extracts words
|
48
|
+
from text with many customizations. It can be used in many fields like Web Crawling
|
49
|
+
and Natural Language Processing.
|
64
50
|
email:
|
65
51
|
- mcetin.cm@gmail.com
|
66
52
|
executables: []
|
@@ -80,6 +66,7 @@ files:
|
|
80
66
|
- lib/textoken/options/modules/conditional_option.rb
|
81
67
|
- lib/textoken/options/modules/numeric_option.rb
|
82
68
|
- lib/textoken/options/modules/regexp_option.rb
|
69
|
+
- lib/textoken/options/modules/tokenizable_option.rb
|
83
70
|
- lib/textoken/options/more_than.rb
|
84
71
|
- lib/textoken/options/only.rb
|
85
72
|
- lib/textoken/options/only_regexp.rb
|
@@ -108,7 +95,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
108
95
|
version: '0'
|
109
96
|
requirements: []
|
110
97
|
rubyforge_project:
|
111
|
-
rubygems_version: 2.4.
|
98
|
+
rubygems_version: 2.4.8
|
112
99
|
signing_key:
|
113
100
|
specification_version: 4
|
114
101
|
summary: Simple and customizable text tokenization gem.
|