textoken 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Rakefile +9 -0
- data/lib/textoken/base.rb +39 -0
- data/lib/textoken/errors.rb +7 -0
- data/lib/textoken/factories/option_factory.rb +23 -0
- data/lib/textoken/findings.rb +34 -0
- data/lib/textoken/options/exclude.rb +28 -0
- data/lib/textoken/options/less_than.rb +32 -0
- data/lib/textoken/options/more_than.rb +32 -0
- data/lib/textoken/options/only.rb +28 -0
- data/lib/textoken/options.rb +36 -0
- data/lib/textoken/regexps/option_values.yml +22 -0
- data/lib/textoken/scanner.rb +33 -0
- data/lib/textoken/searcher.rb +45 -0
- data/lib/textoken/tokenizer.rb +33 -0
- data/lib/textoken/version.rb +3 -0
- data/lib/textoken.rb +42 -0
- metadata +110 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: cf70d771ada2f8b5edaec1b2f6b4c97a4efa563a
|
4
|
+
data.tar.gz: eead0c387504164a302e687fa5858b1bf270d972
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 13f3ff9bd02f7b57954e791ea0c068c3c1b529d7056a7067fa66100c0992e1299cfa0e897a7b27522fa557dfbb6290b2191bf8654a366df38ceb209a30ae9c7f
|
7
|
+
data.tar.gz: d691c3914c7cb78dd396d80d9b8524e2d5e1b3e2786a8518da1558c13916937b59692f29b7723e963aa935c5e42e764c608e9c80d5d7e1b705a1d331e3dfd2c7
|
data/Rakefile
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
module Textoken
  # Entry object of the gem: splits the given text on spaces, builds the
  # Options object for the given option hash and answers #tokens / #words.
  # Neither a nil text nor nil options raise an error here.
  class Base
    attr_reader :text, :dont_split, :findings, :options

    # text - String to tokenize; nil is treated as "no words".
    # opt  - value handed to Options.new (nil when no options are given).
    def initialize(text, opt = nil)
      @text    = initial_split(text)
      @options = Options.new(opt)
    end

    # Runs every option against the text and keeps the intersection of
    # their results in @findings, then lets Tokenizer produce the final
    # array.
    #
    # @findings is rebuilt from scratch on every call: intersecting with the
    # result of a previous call would run Array#& on an identical array and
    # silently drop duplicated words the second time around.
    def tokens
      @findings = nil
      options.collection.each do |option|
        found = option.tokenize(self)
        @findings = @findings.nil? ? found : @findings & found
      end

      Tokenizer.new(self).tokens
    end

    # Same as #tokens but asks Tokenizer not to make the last split.
    # The dont_split flag is restored afterwards so that a later #tokens
    # call on the same object is not affected.
    def words
      previous = @dont_split
      @dont_split = true
      tokens
    ensure
      @dont_split = previous
    end

    private

    # Splits text on whitespace; nil text yields an empty array.
    def initial_split(text)
      text ? text.split(' ') : []
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Textoken
  # This factory creates option objects from user input options.
  # A user option key like 'more_than' (from more_than: 3) gets camelized
  # to MoreThan and the matching class under Textoken is initialized with
  # the option value.
  module OptionFactory
    # key   - option name (Symbol or String), e.g. :more_than
    # value - option value handed to the option class constructor
    #
    # Returns the initialized option object. Unknown keys end up in
    # Textoken.expression_err.
    def self.build(key, value)
      option_klass(key).new(value)
    end

    # NOTE: a bare `private` keyword has no effect on `def self.` methods,
    # so the two helpers below are callable from outside the module; that
    # is left as is to keep the interface unchanged.

    # 'more_than' => 'MoreThan'
    def self.camelize(key)
      key.to_s.split('_').map(&:capitalize).join
    end

    # Looks the class up inside Textoken only. Passing `false` as the
    # inherit flag stops const_get from falling back to top-level constants,
    # so a key such as `string:` or `file:` can no longer resolve to ::String
    # or ::File and get instantiated with the user supplied value.
    # NOTE(review): other classes defined directly under Textoken are still
    # reachable; a dedicated namespace for options would close that too.
    def self.option_klass(key)
      Textoken.const_get(camelize(key).to_sym, false)
    rescue NameError
      Textoken.expression_err("#{key}: is not a valid option.")
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Textoken
  # Collects words together with their array index. The index is used to
  # sort the words and to drop entries that were pushed more than once for
  # the same position (several regexps may reveal the same word).
  class Findings
    def initialize
      @collection = []
    end

    # Pushes an [index, word] pair to the collection.
    # index - Integer or Float position of the word
    # word  - String
    # Anything else is reported through Textoken.type_err.
    def push(index, word)
      type_check(index, word)
      @collection << [index, word]
    end

    # Returns the unique [index, word] pairs sorted by index.
    # Pairs are compared as arrays; the previous string key
    # (index.to_s + word) made [1, "2x"] and [12, "x"] look identical.
    def collection
      @collection.uniq.sort_by(&:first)
    end

    # Returns a one dimensional array of words in index order.
    def result
      collection.map(&:last)
    end

    private

    # Integer replaces Fixnum here: Fixnum was deprecated in Ruby 2.4 and
    # the constant was removed in Ruby 3.2.
    def type_check(i, word)
      return if word.is_a?(String) && (i.is_a?(Integer) || i.is_a?(Float))
      Textoken.type_err("#{word} and #{i} has to be a String and Integer")
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Textoken
  # This option object excludes words in text via matching regexps.
  # Words that match none of the regexps are pushed to findings.
  class Exclude
    attr_reader :regexps, :findings

    def priority
      1
    end

    # values - option value string handed to Searcher, which returns the
    #          Regexp array to exclude by.
    def initialize(values)
      @regexps  = Searcher.new(values).regexps
      @findings = Findings.new
    end

    # base.text is the array of raw tokens.
    #
    # A word is excluded as soon as ANY regexp matches it. Checking the
    # regexps one by one and pushing on every non-match (as was done
    # before) re-added a word excluded by one pattern whenever another
    # pattern did not match it, so option values backed by several
    # patterns never excluded anything.
    #
    # Returns the array of words that were not excluded.
    def tokenize(base)
      base.text.each_with_index do |t, i|
        findings.push(i, t) unless regexps.any? { |r| t.match(r) }
      end
      findings.result
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Textoken
  # This option object picks words in text whose length is less than
  # the option value.
  class LessThan
    attr_reader :number, :findings

    def priority
      2
    end

    # value - Integer greater than 1; anything else is reported through
    #         Textoken.type_err.
    def initialize(value)
      check_value(value)
      @number   = value
      @findings = Findings.new
    end

    # Pushes every word of base.text shorter than number to findings and
    # returns the resulting word array.
    def tokenize(base)
      base.text.each_with_index do |w, i|
        findings.push(i, w) if w.length < number
      end
      findings.result
    end

    private

    # Integer replaces Fixnum here: Fixnum was deprecated in Ruby 2.4 and
    # the constant was removed in Ruby 3.2.
    def check_value(value)
      return if value.is_a?(Integer) && value > 1
      Textoken.type_err "value #{value} is not permitted for\n" \
                        'less_than option it has to be 2 at least.'
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Textoken
  # This option object picks words in text whose length is more than
  # the option value.
  class MoreThan
    attr_reader :number, :findings

    def priority
      2
    end

    # value - Integer greater than or equal to 0; anything else is reported
    #         through Textoken.type_err.
    def initialize(value)
      check_value(value)
      @number   = value
      @findings = Findings.new
    end

    # Pushes every word of base.text longer than number to findings and
    # returns the resulting word array.
    def tokenize(base)
      base.text.each_with_index do |w, i|
        findings.push(i, w) if w.length > number
      end
      findings.result
    end

    private

    # Integer replaces Fixnum here: Fixnum was deprecated in Ruby 2.4 and
    # the constant was removed in Ruby 3.2.
    def check_value(value)
      return if value.is_a?(Integer) && value >= 0
      Textoken.type_err "value #{value} is not permitted for\n" \
                        'more_than option it has to be 0 at least.'
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Textoken
  # Option object that keeps only the words matching one of its regexps.
  # The regexps come from Searcher, built from the option value string.
  class Only
    attr_reader :regexps, :findings

    def initialize(values)
      @findings = Findings.new
      @regexps  = Searcher.new(values).regexps
    end

    def priority
      1
    end

    # Walks base.text once per regexp and pushes every matching word,
    # with its position, to findings.
    # Returns the words collected in findings.
    def tokenize(base)
      regexps.each do |pattern|
        base.text.each.with_index do |token, position|
          next unless token.match(pattern)
          findings.push(position, token)
        end
      end

      findings.result
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Textoken
  # Checks the basic format of the user options and builds the option
  # objects for them. Responds to #collection with an array of option
  # objects sorted by priority. Does not raise when options are nil.
  class Options
    attr_reader :options

    # opt - Hash of option name => option value, or nil.
    # Any other type is reported through Textoken.expression_err.
    def initialize(opt)
      @options    = opt
      @collection = nil
      check_options_format
    end

    # Returns the option objects sorted by priority.
    # The array is built once and reused: building it on every call
    # appended a fresh copy of every option each time #collection was
    # asked for, so the list kept growing.
    def collection
      @collection ||= build_collection
    end

    private

    def check_options_format
      return if options.nil? || options.is_a?(Hash)
      Textoken.expression_err("#{options} is not a valid format, you can use;\n" \
        "Textoken('Alfa beta.', exclude: 'dates, phones', more_than: 3)")
    end

    # Builds all option objects first and only then sorts them, so a
    # failing option never leaves a half-built list cached.
    def build_collection
      built = (options || {}).map { |key, value| init_option(key, value) }
      built.sort_by(&:priority)
    end

    def init_option(key, value)
      Textoken::OptionFactory.build(key, value)
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# This file stands for options you can pass as an argument like,
|
2
|
+
# only: 'dates, numbers, etc..' or exclude: 'dates'
|
3
|
+
|
4
|
+
# You can add many regexps under one type, all of them will be searchable
|
5
|
+
|
6
|
+
punctuations:
|
7
|
+
# punctuations
|
8
|
+
- \W+
|
9
|
+
|
10
|
+
numerics:
|
11
|
+
- (?:\d*\.)?\d+
|
12
|
+
- (?:\d*\,)?\d+
|
13
|
+
|
14
|
+
phones:
|
15
|
+
# covers p:444-555-1234 f:246.555.8888 m:1235554567
|
16
|
+
- \b\d{3}[-.]?\d{3}[-.]?\d{4}\b
|
17
|
+
|
18
|
+
dates:
|
19
|
+
# mm/dd/yyyy format
|
20
|
+
- ^(0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])[- /.](19|20)\d\d$
|
21
|
+
# dd-mm-yyyy format
|
22
|
+
- ^(0[1-9]|[12][0-9]|3[01])[- /.](0[1-9]|1[012])[- /.](19|20)\d\d$
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Textoken
  # Scanner finds tokens in a word with a regexp.
  # #result answers the word split around every match, or nil when the
  # word does not match the regexp at all.
  class Scanner
    attr_reader :word, :regexp

    # word   - String to scan
    # regexp - Regexp to look for
    # Other types are reported through Textoken.type_err.
    def initialize(word, regexp)
      @word   = word
      @regexp = regexp
      check_types
    end

    # Surrounds every match of regexp with spaces in one pass and splits
    # on whitespace, e.g. 'beta.' with /\W+/ => ['beta', '.'].
    #
    # Doing this in a single regexp gsub matters: replacing match by match
    # with string patterns re-split earlier matches whenever one match was
    # contained in another ('wait...what.' turned '...' into three dots).
    #
    # Returns an Array of Strings, or nil when nothing matched.
    def result
      return nil unless word =~ regexp
      word.gsub(regexp) { |match| " #{match} " }.split(' ')
    end

    private

    def check_types
      return if word.is_a?(String) && regexp.is_a?(Regexp)
      Textoken.type_err("#{regexp} should be Regexp and #{word}\n" \
        'has to be a string.')
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Textoken
  # This class reads the YAML file lib/textoken/regexps/option_values.yml
  # and returns the array of regexps declared there for the option values
  # passed as an argument.
  # todo: add regexp support
  class Searcher
    attr_reader :values, :yaml

    # values - comma separated String of option values, e.g.
    #          'numerics' or 'numerics, dates, phones'.
    # Anything that cannot be split that way is reported through
    # Textoken.expression_err.
    def initialize(values)
      @values  = check_and_init_values(values)
      @yaml    = load_file
      @regexps = []
    end

    # Returns an Array of Regexp objects for the requested values.
    # The pattern list is rebuilt on every call instead of appended to, so
    # calling #regexps more than once no longer returns duplicated regexps.
    def regexps
      @regexps = values.flat_map { |v| patterns_for(v) }
      @regexps.map { |r| Regexp.new(r) }
    end

    private

    def check_and_init_values(values)
      values.split(',').map(&:strip)
    rescue StandardError
      # The example names a key that really exists in option_values.yml
      # ('numerics'); the former 'numbers' example was itself rejected.
      Textoken.expression_err("#{values} are not supported. Correct format,\n" \
        "has to be 'numerics' or 'numerics, dates, phones'")
    end

    def load_file
      YAML.load_file("#{GEM_ROOT}/lib/textoken/regexps/option_values.yml")
    end

    # Option values have to be declared in option_values.yml; unknown ones
    # are reported through Textoken.expression_err.
    # Returns the pattern strings declared for the value.
    def patterns_for(value)
      Textoken.expression_err("#{value}: is not permitted.") unless yaml.key?(value)
      Array(yaml[value])
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Textoken
  # This is the last step in the process.
  # Takes the findings of base (or its raw text when no option was given)
  # and makes the finishing punctuation split.
  # Does not split if base.dont_split is true.
  class Tokenizer
    attr_reader :base, :result, :col

    # base - object responding to options, findings, text and dont_split.
    def initialize(base)
      @base   = base
      @result = []
      @col    = base.options.collection.length > 0 ? base.findings : base.text
    end

    # Returns col untouched when base.dont_split is set, otherwise the
    # words of col with punctuation split off.
    # The result is assigned, not appended, so calling #tokens again on the
    # same object returns the same array instead of a doubled one.
    def tokens
      return col if base.dont_split
      split_punctuations
      @result
    end

    private

    # Words without a match are kept as they are (Scanner#result is nil).
    def split_punctuations
      @result = col.flat_map do |w|
        Scanner.new(w, default_regexp).result || [w]
      end
    end

    # Used for finding punctuations; built once per tokenizer rather than
    # once per word.
    def default_regexp
      @default_regexp ||= Regexp.new('\W+')
    end
  end
end
|
data/lib/textoken.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
|
3
|
+
require 'textoken/version'
|
4
|
+
require 'textoken/base'
|
5
|
+
require 'textoken/errors'
|
6
|
+
require 'textoken/searcher'
|
7
|
+
require 'textoken/options'
|
8
|
+
require 'textoken/findings'
|
9
|
+
require 'textoken/tokenizer'
|
10
|
+
require 'textoken/scanner'
|
11
|
+
|
12
|
+
require 'textoken/options/less_than'
|
13
|
+
require 'textoken/options/more_than'
|
14
|
+
require 'textoken/options/only'
|
15
|
+
require 'textoken/options/exclude'
|
16
|
+
|
17
|
+
require 'textoken/factories/option_factory'
|
18
|
+
|
19
|
+
# Textoken is a library for customizable tokenization of texts.
|
20
|
+
# Customizable tokenization can be used in many areas including NLP purposes
|
21
|
+
module Textoken
  # Root directory of the gem, two levels above this file.
  GEM_ROOT = File.expand_path('../..', __FILE__)

  # Raises ExpressionError with msg; used for wrong user input of
  # options & values.
  def self.expression_err(msg)
    raise ExpressionError, msg
  end

  # Raises TypeError with msg; used when the input format is right but
  # the option value is not suitable for the option.
  def self.type_err(msg)
    raise TypeError, msg
  end
end
|
37
|
+
|
38
|
+
# Shortcut for building a Textoken::Base without naming the class:
#   Textoken('I had rather be first in a village than second at Rome.')
#
# text    - the text handed to Textoken::Base.new
# options - the options handed to Textoken::Base.new (nil by default)
#
# Returns the new Textoken::Base instance.
def Textoken(text, options = nil)
  Textoken::Base.new(text, options)
end
|
metadata
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: textoken
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Mehmet Cetin
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-10-03 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 3.3.0
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 3.3.0
|
23
|
+
type: :development
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 3.3.0
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 3.3.0
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: rake
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '10.0'
|
40
|
+
type: :development
|
41
|
+
prerelease: false
|
42
|
+
version_requirements: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - "~>"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '10.0'
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: pry
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - "~>"
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - "~>"
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
description: "Textoken is a Ruby library for text tokenization. \n This gem extracts
|
62
|
+
words from text with many customizations. \n It can be used in many fields like
|
63
|
+
crawling and Natural Language Processing."
|
64
|
+
email:
|
65
|
+
- mcetin.cm@gmail.com
|
66
|
+
executables: []
|
67
|
+
extensions: []
|
68
|
+
extra_rdoc_files: []
|
69
|
+
files:
|
70
|
+
- Rakefile
|
71
|
+
- lib/textoken.rb
|
72
|
+
- lib/textoken/base.rb
|
73
|
+
- lib/textoken/errors.rb
|
74
|
+
- lib/textoken/factories/option_factory.rb
|
75
|
+
- lib/textoken/findings.rb
|
76
|
+
- lib/textoken/options.rb
|
77
|
+
- lib/textoken/options/exclude.rb
|
78
|
+
- lib/textoken/options/less_than.rb
|
79
|
+
- lib/textoken/options/more_than.rb
|
80
|
+
- lib/textoken/options/only.rb
|
81
|
+
- lib/textoken/regexps/option_values.yml
|
82
|
+
- lib/textoken/scanner.rb
|
83
|
+
- lib/textoken/searcher.rb
|
84
|
+
- lib/textoken/tokenizer.rb
|
85
|
+
- lib/textoken/version.rb
|
86
|
+
homepage: https://github.com/manorie/textoken
|
87
|
+
licenses:
|
88
|
+
- MIT
|
89
|
+
metadata: {}
|
90
|
+
post_install_message:
|
91
|
+
rdoc_options: []
|
92
|
+
require_paths:
|
93
|
+
- lib
|
94
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
95
|
+
requirements:
|
96
|
+
- - ">="
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: '0'
|
99
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
requirements: []
|
105
|
+
rubyforge_project:
|
106
|
+
rubygems_version: 2.4.5.1
|
107
|
+
signing_key:
|
108
|
+
specification_version: 4
|
109
|
+
summary: Simple and customizable text tokenization gem.
|
110
|
+
test_files: []
|