textoken 1.0.0
- checksums.yaml +7 -0
- data/Rakefile +9 -0
- data/lib/textoken/base.rb +39 -0
- data/lib/textoken/errors.rb +7 -0
- data/lib/textoken/factories/option_factory.rb +23 -0
- data/lib/textoken/findings.rb +34 -0
- data/lib/textoken/options/exclude.rb +28 -0
- data/lib/textoken/options/less_than.rb +32 -0
- data/lib/textoken/options/more_than.rb +32 -0
- data/lib/textoken/options/only.rb +28 -0
- data/lib/textoken/options.rb +36 -0
- data/lib/textoken/regexps/option_values.yml +22 -0
- data/lib/textoken/scanner.rb +33 -0
- data/lib/textoken/searcher.rb +45 -0
- data/lib/textoken/tokenizer.rb +33 -0
- data/lib/textoken/version.rb +3 -0
- data/lib/textoken.rb +42 -0
- metadata +110 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: cf70d771ada2f8b5edaec1b2f6b4c97a4efa563a
  data.tar.gz: eead0c387504164a302e687fa5858b1bf270d972
SHA512:
  metadata.gz: 13f3ff9bd02f7b57954e791ea0c068c3c1b529d7056a7067fa66100c0992e1299cfa0e897a7b27522fa557dfbb6290b2191bf8654a366df38ceb209a30ae9c7f
  data.tar.gz: d691c3914c7cb78dd396d80d9b8524e2d5e1b3e2786a8518da1558c13916937b59692f29b7723e963aa935c5e42e764c608e9c80d5d7e1b705a1d331e3dfd2c7

data/lib/textoken/base.rb
ADDED
@@ -0,0 +1,39 @@
module Textoken
  # Inits options, findings and responds to tokens
  # Does not raise an error when text or options are nil
  # Splits the text and makes it ready for other operations
  class Base
    attr_reader :text, :dont_split, :findings, :options

    def initialize(text, opt = nil)
      @text = initial_split(text)
      @options = Options.new(opt)
    end

    # we take the intersection of the result arrays
    # returned by multiple options
    def tokens
      options.collection.each do |option|
        if @findings.nil?
          @findings = option.tokenize(self)
        else
          @findings &= option.tokenize(self)
        end
      end

      Tokenizer.new(self).tokens
    end

    def words
      # tokenize with options but do not make the final split
      @dont_split = true
      tokens
    end

    private

    def initial_split(text)
      text ? text.split(' ') : []
    end
  end
end
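
For orientation, a minimal sketch of how Base is usually driven through the Textoken() shortcut defined further down in data/lib/textoken.rb; the sample text and option value are illustrative, not taken from the gem's docs.

base = Textoken('Call me at 246.555.8888 tomorrow', exclude: 'phones')

base.tokens # runs each option, intersects their results, then does the final punctuation split
base.words  # same filtering, but sets dont_split so the final punctuation split is skipped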

data/lib/textoken/factories/option_factory.rb
ADDED
@@ -0,0 +1,23 @@
module Textoken
  # This factory creates option objects from the user input options
  # A user input option like 'more_than: 3' (more_than) gets camelized
  # and objects get initialized with error handling
  module OptionFactory
    def self.build(key, value)
      option_klass(key).new(value)
    end

    private

    def self.camelize(key)
      key.to_s.split('_').map(&:capitalize).join
    end

    def self.option_klass(key)
      # add a module to make other classes inaccessible
      Textoken.const_get(camelize(key).to_sym)
    rescue NameError
      Textoken.expression_err("#{key}: is not a valid option.")
    end
  end
end
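
A quick sketch of the key-to-class lookup the factory performs; the option keys and values below are examples only.

Textoken::OptionFactory.build(:more_than, 3)
# camelize(:more_than) gives "MoreThan", so this resolves Textoken::MoreThan and calls .new(3)

Textoken::OptionFactory.build(:bogus, 1)
# raises an ExpressionError: "bogus: is not a valid option."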

data/lib/textoken/findings.rb
ADDED
@@ -0,0 +1,34 @@
module Textoken
  # This class collects split words together with their array index
  # It also de-duplicates multiple words at the same index,
  # which happens when regexps reveal the same or overlapping results
  class Findings
    def initialize
      @collection = []
    end

    # Here we push items to the collection array with an index number
    # the index number helps us sort and make the array unique
    def push(index, word)
      type_check(index, word)
      @collection << [index, word]
    end

    # collection returns a sorted and unique array of tokens
    def collection
      @collection.uniq { |w| w[0].to_s + w[1] }.sort_by(&:first)
    end

    # result returns a one-dimensional array of words
    def result
      collection.map(&:last)
    end

    private

    def type_check(i, word)
      return if word.is_a?(String) && (i.is_a?(Fixnum) || i.is_a?(Float))
      Textoken.type_err("#{word} and #{i} have to be a String and an Integer")
    end
  end
end
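
A small sketch of the de-duplication and ordering behaviour; the values are made up.

f = Textoken::Findings.new
f.push(1, 'world')
f.push(0, 'hello')
f.push(1, 'world')   # same index/word pair, dropped by #collection
f.collection         # => [[0, "hello"], [1, "world"]]
f.result             # => ["hello", "world"]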

data/lib/textoken/options/exclude.rb
ADDED
@@ -0,0 +1,28 @@
module Textoken
  # This option object excludes words in the text that match its regexps
  # Non-excluded words are pushed to the findings array
  class Exclude
    attr_reader :regexps, :findings

    def priority
      1
    end

    def initialize(values)
      @regexps = Searcher.new(values).regexps
      @findings = Findings.new
    end

    # base.text is the raw tokens split with ' '
    # regexps is the array of Regexps to search for
    # findings is the Findings object the non-matching tokens are pushed to
    def tokenize(base)
      regexps.each do |r|
        base.text.each_with_index do |t, i|
          findings.push(i, t) unless t.match(r)
        end
      end
      findings.result
    end
  end
end

data/lib/textoken/options/less_than.rb
ADDED
@@ -0,0 +1,32 @@
module Textoken
  # This option object picks words in the text whose length is
  # less than the option value
  class LessThan
    attr_reader :number, :findings

    def priority
      2
    end

    def initialize(value)
      check_value(value)
      @number = value
      @findings = Findings.new
    end

    def tokenize(base)
      base.text.each_with_index do |w, i|
        findings.push(i, w) if w.length < number
      end
      findings.result
    end

    private

    def check_value(value)
      return if value.class == Fixnum && value > 1
      Textoken.type_err "value #{value} is not permitted for the
        less_than option, it has to be at least 2."
    end
  end
end

data/lib/textoken/options/more_than.rb
ADDED
@@ -0,0 +1,32 @@
module Textoken
  # This option object picks words in the text whose length is
  # more than the option value
  class MoreThan
    attr_reader :number, :findings

    def priority
      2
    end

    def initialize(value)
      check_value(value)
      @number = value
      @findings = Findings.new
    end

    def tokenize(base)
      base.text.each_with_index do |w, i|
        findings.push(i, w) if w.length > number
      end
      findings.result
    end

    private

    def check_value(value)
      return if value.class == Fixnum && value >= 0
      Textoken.type_err "value #{value} is not permitted for the
        more_than option, it has to be at least 0."
    end
  end
end

data/lib/textoken/options/only.rb
ADDED
@@ -0,0 +1,28 @@
module Textoken
  # This option object selects words in the text that match its regexps
  # the regexps should be defined in option_values.yml
  class Only
    attr_reader :regexps, :findings

    def priority
      1
    end

    def initialize(values)
      @regexps = Searcher.new(values).regexps
      @findings = Findings.new
    end

    # base.text is the raw tokens split with ' '
    # regexps is the array of Regexps to search for
    # findings is the Findings object the matching tokens are pushed to
    def tokenize(base)
      regexps.each do |r|
        base.text.each_with_index do |t, i|
          findings.push(i, t) if t.match(r)
        end
      end
      findings.result
    end
  end
end
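
Only is the mirror image of Exclude: it keeps the matching tokens instead of dropping them. A hedged sketch with an illustrative text and option value:

only = Textoken::Only.new('numerics')
only.tokenize(Textoken('room 101 and room 102'))
# pushes the tokens matching the numeric regexps ("101" and "102") and returns them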

data/lib/textoken/options.rb
ADDED
@@ -0,0 +1,36 @@
module Textoken
  # Creates the collection array and checks the basic hash format.
  # Responds to collection as an array of options sorted by
  # priority. Does not raise an exception when options are nil
  class Options
    attr_reader :options

    def initialize(opt)
      @options = opt
      @collection = []
      check_options_format
    end

    def collection
      options.each { |k, v| init_option(k, v) } if options
      sort_collection
      @collection
    end

    private

    def check_options_format
      return if options.nil? || options.is_a?(Hash)
      Textoken.expression_err("#{options} is not a valid format, you can use;
        Textoken('Alfa beta.', exclude: 'dates, phones', more_than: 3)")
    end

    def init_option(key, value)
      @collection << Textoken::OptionFactory.build(key, value)
    end

    def sort_collection
      @collection.sort! { |a, b| a.priority <=> b.priority }
    end
  end
end
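
A short sketch of the priority sorting; the option hash is an example.

opts = Textoken::Options.new(more_than: 2, only: 'numerics')
opts.collection.map(&:class)
# => [Textoken::Only, Textoken::MoreThan], sorted so priority 1 options run before priority 2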

data/lib/textoken/regexps/option_values.yml
ADDED
@@ -0,0 +1,22 @@
# This file lists the option values you can pass as an argument, like
# only: 'dates, numerics' or exclude: 'dates'

# You can add many regexps under one type, all of them will be searchable

punctuations:
  # punctuations
  - \W+

numerics:
  - (?:\d*\.)?\d+
  - (?:\d*\,)?\d+

phones:
  # covers p:444-555-1234 f:246.555.8888 m:1235554567
  - \b\d{3}[-.]?\d{3}[-.]?\d{4}\b

dates:
  # mm/dd/yyyy format
  - ^(0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])[- /.](19|20)\d\d$
  # dd-mm-yyyy format
  - ^(0[1-9]|[12][0-9]|3[01])[- /.](0[1-9]|1[012])[- /.](19|20)\d\d$
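
These top-level keys (punctuations, numerics, phones, dates) are the values the only: and exclude: options accept; an illustrative call, with made-up text:

Textoken('Fine on 10/03/2015 at 14.30', only: 'dates, numerics').tokens
# keeps only the tokens matching the date or numeric regexps before the final split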

data/lib/textoken/scanner.rb
ADDED
@@ -0,0 +1,33 @@
module Textoken
  # Scanner finds tokens in a word with a regexp
  # If the word does not match the regexp, result returns nil
  class Scanner
    attr_reader :word, :regexp

    def initialize(word, regexp)
      @word = word
      @regexp = regexp
      check_types
    end

    def result
      scan = word.scan(regexp)
      scan.length > 0 ? partition(scan, word) : nil
    end

    private

    def partition(scan, word)
      scan.each do |p|
        word = word.gsub(p, ' ' + p + ' ')
      end
      word.split(' ')
    end

    def check_types
      return if word.is_a?(String) && regexp.is_a?(Regexp)
      Textoken.type_err("#{regexp} should be a Regexp and #{word}
        has to be a String.")
    end
  end
end
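
A short sketch of the scan-and-partition step; the inputs are examples.

Textoken::Scanner.new('again,and,again', Regexp.new('\W+')).result
# => ["again", ",", "and", ",", "again"]

Textoken::Scanner.new('plain', Regexp.new('\W+')).result
# => nil (no match), which Tokenizer turns back into the word itself via `|| [w]`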

data/lib/textoken/searcher.rb
ADDED
@@ -0,0 +1,45 @@
module Textoken
  # This class reads the YAML file under regexps/option_values.yml and
  # returns an array of regexps representing the values passed as an argument
  # TODO: add regexp support
  class Searcher
    attr_reader :values, :yaml

    def initialize(values)
      @values = check_and_init_values(values)
      @yaml = load_file
      @regexps = []
    end

    def regexps
      match_keys
      @regexps.map.each { |r| Regexp.new(r) }
    end

    private

    def check_and_init_values(values)
      values.split(',').map(&:strip)
    rescue
      Textoken.expression_err("#{values} are not supported. Correct format
        has to be 'numerics' or 'numerics, dates, phones'")
    end

    def load_file
      YAML.load_file("#{GEM_ROOT}/lib/textoken/regexps/option_values.yml")
    end

    # here we check the option values the user supplied
    # option values have to be declared in option_values.yml
    def match_keys
      values.each do |v|
        Textoken.expression_err("#{v}: is not permitted.") unless yaml.key?(v)
        add_regexps(yaml[v])
      end
    end

    def add_regexps(arr)
      @regexps += arr
    end
  end
end
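
A sketch of how option values are translated into regexps; the values passed below are examples and must be keys declared in option_values.yml.

Textoken::Searcher.new('phones, dates').regexps
# => an Array of Regexp objects built from the 'phones' and 'dates' entries of option_values.yml

Textoken::Searcher.new('colors').regexps
# raises an ExpressionError because 'colors' is not declared in option_values.yml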

data/lib/textoken/tokenizer.rb
ADDED
@@ -0,0 +1,33 @@
module Textoken
  # This is the last step in the process
  # Takes the findings and makes the finishing punctuation split
  # Does not split if base.dont_split is true
  class Tokenizer
    attr_reader :base, :result, :col

    def initialize(base)
      @base = base
      @result = []
      @col = base.options.collection.length > 0 ? base.findings : base.text
    end

    def tokens
      return col if base.dont_split
      split_punctuations
      @result
    end

    private

    def split_punctuations
      col.each do |w|
        @result += Scanner.new(w, default_regexp).result || [w]
      end
    end

    # will be used for finding punctuations
    def default_regexp
      Regexp.new('\W+')
    end
  end
end
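
A brief sketch of the finishing split on its own; the text is illustrative.

Textoken::Tokenizer.new(Textoken('Hello, world!')).tokens
# => ["Hello", ",", "world", "!"] (punctuation is scanned out of each raw word)
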
data/lib/textoken.rb
ADDED
@@ -0,0 +1,42 @@
require 'yaml'

require 'textoken/version'
require 'textoken/base'
require 'textoken/errors'
require 'textoken/searcher'
require 'textoken/options'
require 'textoken/findings'
require 'textoken/tokenizer'
require 'textoken/scanner'

require 'textoken/options/less_than'
require 'textoken/options/more_than'
require 'textoken/options/only'
require 'textoken/options/exclude'

require 'textoken/factories/option_factory'

# Textoken is a library for customizable tokenization of texts.
# Customizable tokenization can be used in many areas, including NLP purposes
module Textoken
  GEM_ROOT = File.expand_path('../..', __FILE__)

  class << self
    # Expression error raised on wrong user input of options & values
    def expression_err(msg)
      fail ExpressionError, msg
    end

    # Type error raised when the user input format is right but the value
    # is not suitable for the option
    def type_err(msg)
      fail TypeError, msg
    end
  end
end

# A shortcut to initialize the gem
# Textoken('I had rather be first in a village than second at Rome.')
def Textoken(text, options = nil)
  Textoken::Base.new(text, options)
end
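
Putting it together, a hedged end-to-end sketch; the text and options are illustrative, and the outputs follow from the code above rather than from the gem's documentation.

require 'textoken'

Textoken('I had rather be first in a village than second at Rome.').tokens
# every word, plus the trailing punctuation split out as its own token

Textoken('Call 246.555.8888 before 10/03/2015 please', only: 'phones, dates').words
# => ["246.555.8888", "10/03/2015"], only the tokens matching the phone and date regexps, left unsplit
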
metadata
ADDED
@@ -0,0 +1,110 @@
--- !ruby/object:Gem::Specification
name: textoken
version: !ruby/object:Gem::Version
  version: 1.0.0
platform: ruby
authors:
- Mehmet Cetin
autorequire:
bindir: bin
cert_chain: []
date: 2015-10-03 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: rspec
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: 3.3.0
    - - ">="
      - !ruby/object:Gem::Version
        version: 3.3.0
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: 3.3.0
    - - ">="
      - !ruby/object:Gem::Version
        version: 3.3.0
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
- !ruby/object:Gem::Dependency
  name: pry
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0'
description: "Textoken is a Ruby library for text tokenization. \n This gem extracts
  words from text with many customizations. \n It can be used in many fields like
  crawling and Natural Language Processing."
email:
- mcetin.cm@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- Rakefile
- lib/textoken.rb
- lib/textoken/base.rb
- lib/textoken/errors.rb
- lib/textoken/factories/option_factory.rb
- lib/textoken/findings.rb
- lib/textoken/options.rb
- lib/textoken/options/exclude.rb
- lib/textoken/options/less_than.rb
- lib/textoken/options/more_than.rb
- lib/textoken/options/only.rb
- lib/textoken/regexps/option_values.yml
- lib/textoken/scanner.rb
- lib/textoken/searcher.rb
- lib/textoken/tokenizer.rb
- lib/textoken/version.rb
homepage: https://github.com/manorie/textoken
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.4.5.1
signing_key:
specification_version: 4
summary: Simple and customizable text tokenization gem.
test_files: []