textoken 1.0.0

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: cf70d771ada2f8b5edaec1b2f6b4c97a4efa563a
+   data.tar.gz: eead0c387504164a302e687fa5858b1bf270d972
+ SHA512:
+   metadata.gz: 13f3ff9bd02f7b57954e791ea0c068c3c1b529d7056a7067fa66100c0992e1299cfa0e897a7b27522fa557dfbb6290b2191bf8654a366df38ceb209a30ae9c7f
+   data.tar.gz: d691c3914c7cb78dd396d80d9b8524e2d5e1b3e2786a8518da1558c13916937b59692f29b7723e963aa935c5e42e764c608e9c80d5d7e1b705a1d331e3dfd2c7
data/Rakefile ADDED
@@ -0,0 +1,9 @@
+ require "bundler/gem_tasks"
+ require "rspec/core/rake_task"
+
+ require "rake"
+
+ RSpec::Core::RakeTask.new
+
+ task default: :spec
+ task test: :spec
data/lib/textoken/base.rb ADDED
@@ -0,0 +1,39 @@
+ module Textoken
+   # Initializes options and findings and responds to #tokens.
+   # Does not raise an error when text or options are nil.
+   # Splits the text and makes it ready for other operations.
+   class Base
+     attr_reader :text, :dont_split, :findings, :options
+
+     def initialize(text, opt = nil)
+       @text = initial_split(text)
+       @options = Options.new(opt)
+     end
+
+     # We take the intersection of the result arrays
+     # returned by multiple options.
+     def tokens
+       options.collection.each do |option|
+         if @findings.nil?
+           @findings = option.tokenize(self)
+         else
+           @findings &= option.tokenize(self)
+         end
+       end
+
+       Tokenizer.new(self).tokens
+     end
+
+     def words
+       # Tokenize with options but skip the final punctuation split.
+       @dont_split = true
+       tokens
+     end
+
+     private
+
+     def initial_split(text)
+       text ? text.split(' ') : []
+     end
+   end
+ end
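A minimal usage sketch of Base, assuming a Ruby contemporary with the gem (the code relies on Fixnum); the sample sentence and the result comment are illustrative.

require 'textoken'

base = Textoken::Base.new('Call me at 444-555-1234 tomorrow morning', only: 'phones', more_than: 3)
base.words
# => ["444-555-1234"], the intersection of what the two options select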
data/lib/textoken/errors.rb ADDED
@@ -0,0 +1,7 @@
+ module Textoken
+   class ExpressionError < Exception
+   end
+
+   class TypeError < Exception
+   end
+ end
data/lib/textoken/factories/option_factory.rb ADDED
@@ -0,0 +1,23 @@
+ module Textoken
+   # This factory creates option objects from user-supplied options.
+   # An option key like 'more_than: 3' (more_than) gets camelized
+   # and the matching class is initialized, with error handling.
+   module OptionFactory
+     def self.build(key, value)
+       option_klass(key).new(value)
+     end
+
+     private
+
+     def self.camelize(key)
+       key.to_s.split('_').map(&:capitalize).join
+     end
+
+     def self.option_klass(key)
+       # Add a namespace module to make other classes inaccessible here.
+       Textoken.const_get(camelize(key).to_sym)
+     rescue NameError
+       Textoken.expression_err("#{key}: is not a valid option.")
+     end
+   end
+ end
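A short sketch of how the factory resolves option keys; return values are indicative.

Textoken::OptionFactory.build(:more_than, 3) # => a Textoken::MoreThan instance (:more_than camelizes to "MoreThan")
Textoken::OptionFactory.build(:foo, 1)       # raises Textoken::ExpressionError ("foo: is not a valid option.")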
data/lib/textoken/findings.rb ADDED
@@ -0,0 +1,34 @@
+ module Textoken
+   # This class collects split words together with their array index
+   # and de-duplicates multiple words at the same index caused by
+   # regexps that yield the same or overlapping results.
+   class Findings
+     def initialize
+       @collection = []
+     end
+
+     # Push items to the collection array with their index number;
+     # the index lets us sort and make the array unique.
+     def push(index, word)
+       type_check(index, word)
+       @collection << [index, word]
+     end
+
+     # Returns a sorted and unique array of [index, word] pairs.
+     def collection
+       @collection.uniq { |w| w[0].to_s + w[1] }.sort_by(&:first)
+     end
+
+     # Returns a one-dimensional array of words.
+     def result
+       collection.map(&:last)
+     end
+
+     private
+
+     def type_check(i, word)
+       return if word.is_a?(String) && (i.is_a?(Fixnum) || i.is_a?(Float))
+       Textoken.type_err("#{word} and #{i} has to be a String and Integer")
+     end
+   end
+ end
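A small sketch of the Findings API; the words and indexes are arbitrary.

f = Textoken::Findings.new
f.push(2, 'beta')
f.push(0, 'alfa')
f.push(2, 'beta')  # duplicate index/word pair, kept only once
f.collection       # => [[0, "alfa"], [2, "beta"]]
f.result           # => ["alfa", "beta"]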
data/lib/textoken/options/exclude.rb ADDED
@@ -0,0 +1,28 @@
+ module Textoken
+   # This option object excludes words in the text via matching regexps.
+   # Non-excluded words are pushed to the findings array.
+   class Exclude
+     attr_reader :regexps, :findings
+
+     def priority
+       1
+     end
+
+     def initialize(values)
+       @regexps = Searcher.new(values).regexps
+       @findings = Findings.new
+     end
+
+     # base.text is the raw tokens split on ' ',
+     # regexps is the array of Regexps to search for,
+     # and findings is the Findings object collecting non-matching tokens.
+     def tokenize(base)
+       regexps.each do |r|
+         base.text.each_with_index do |t, i|
+           findings.push(i, t) unless t.match(r)
+         end
+       end
+       findings.result
+     end
+   end
+ end
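A hedged sketch of the exclude option, using the Textoken() shortcut defined in data/lib/textoken.rb below and the 'phones' key from option_values.yml; the expected output is indicative.

Textoken('fax 246.555.8888 or call 444-555-1234 today', exclude: 'phones').tokens
# => roughly ["fax", "or", "call", "today"]; the phone-like tokens are dropped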
data/lib/textoken/options/less_than.rb ADDED
@@ -0,0 +1,32 @@
+ module Textoken
+   # This option object picks words whose length is less than
+   # the option value.
+   class LessThan
+     attr_reader :number, :findings
+
+     def priority
+       2
+     end
+
+     def initialize(value)
+       check_value(value)
+       @number = value
+       @findings = Findings.new
+     end
+
+     def tokenize(base)
+       base.text.each_with_index do |w, i|
+         findings.push(i, w) if w.length < number
+       end
+       findings.result
+     end
+
+     private
+
+     def check_value(value)
+       return if value.class == Fixnum && value > 1
+       Textoken.type_err "value #{value} is not permitted for
+         less_than option it has to be 2 at least."
+     end
+   end
+ end
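A brief sketch of the two length filters (this one and MoreThan below), using the sample sentence from the gem's own comments; outputs are indicative.

Textoken('I had rather be first in a village', less_than: 5).tokens
# => words shorter than 5 characters: ["I", "had", "be", "in", "a"]
Textoken('I had rather be first in a village', more_than: 5).tokens
# => words longer than 5 characters: ["rather", "village"]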
data/lib/textoken/options/more_than.rb ADDED
@@ -0,0 +1,32 @@
+ module Textoken
+   # This option object picks words whose length is more than
+   # the option value.
+   class MoreThan
+     attr_reader :number, :findings
+
+     def priority
+       2
+     end
+
+     def initialize(value)
+       check_value(value)
+       @number = value
+       @findings = Findings.new
+     end
+
+     def tokenize(base)
+       base.text.each_with_index do |w, i|
+         findings.push(i, w) if w.length > number
+       end
+       findings.result
+     end
+
+     private
+
+     def check_value(value)
+       return if value.class == Fixnum && value >= 0
+       Textoken.type_err "value #{value} is not permitted for
+         more_than option it has to be 0 at least."
+     end
+   end
+ end
data/lib/textoken/options/only.rb ADDED
@@ -0,0 +1,28 @@
+ module Textoken
+   # This option object selects words in the text via matching regexps;
+   # the regexps are defined in option_values.yml.
+   class Only
+     attr_reader :regexps, :findings
+
+     def priority
+       1
+     end
+
+     def initialize(values)
+       @regexps = Searcher.new(values).regexps
+       @findings = Findings.new
+     end
+
+     # base.text is the raw tokens split on ' ',
+     # regexps is the array of Regexps to search for,
+     # and findings is the Findings object collecting matching tokens.
+     def tokenize(base)
+       regexps.each do |r|
+         base.text.each_with_index do |t, i|
+           findings.push(i, t) if t.match(r)
+         end
+       end
+       findings.result
+     end
+   end
+ end
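A sketch mirroring the exclude example above, this time keeping only the matching tokens; note that #words skips the final punctuation split, so the dots and dashes inside the numbers survive.

Textoken('fax 246.555.8888 or call 444-555-1234 today', only: 'phones').words
# => ["246.555.8888", "444-555-1234"]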
data/lib/textoken/options.rb ADDED
@@ -0,0 +1,36 @@
+ module Textoken
+   # Creates the collection array and checks the basic hash format.
+   # Responds to #collection with an array of option objects sorted by
+   # priority. Does not raise an exception when options are nil.
+   class Options
+     attr_reader :options
+
+     def initialize(opt)
+       @options = opt
+       @collection = []
+       check_options_format
+     end
+
+     def collection
+       options.each { |k, v| init_option(k, v) } if options
+       sort_collection
+       @collection
+     end
+
+     private
+
+     def check_options_format
+       return if options.nil? || options.is_a?(Hash)
+       Textoken.expression_err("#{options} is not a valid format, you can use;
+         Textoken('Alfa beta.', exclude: 'dates, phones', more_than: 3)")
+     end
+
+     def init_option(key, value)
+       @collection << Textoken::OptionFactory.build(key, value)
+     end
+
+     def sort_collection
+       @collection.sort! { |a, b| a.priority <=> b.priority }
+     end
+   end
+ end
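A sketch of how the collection is built and ordered; priorities come from the option classes above (Only and Exclude are 1, LessThan and MoreThan are 2).

opts = Textoken::Options.new(more_than: 3, only: 'numerics')
opts.collection.map(&:class)
# => [Textoken::Only, Textoken::MoreThan], sorted by priority
Textoken::Options.new('not a hash')
# raises Textoken::ExpressionError describing the expected format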
data/lib/textoken/regexps/option_values.yml ADDED
@@ -0,0 +1,22 @@
+ # This file defines the values you can pass to options, e.g.
+ # only: 'dates, numerics' or exclude: 'dates'.
+
+ # You can add many regexps under one type; all of them will be searchable.
+
+ punctuations:
+   # punctuations
+   - \W+
+
+ numerics:
+   - (?:\d*\.)?\d+
+   - (?:\d*\,)?\d+
+
+ phones:
+   # covers p:444-555-1234 f:246.555.8888 m:1235554567
+   - \b\d{3}[-.]?\d{3}[-.]?\d{4}\b
+
+ dates:
+   # mm/dd/yyyy format
+   - ^(0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])[- /.](19|20)\d\d$
+   # dd-mm-yyyy format
+   - ^(0[1-9]|[12][0-9]|3[01])[- /.](0[1-9]|1[012])[- /.](19|20)\d\d$
data/lib/textoken/scanner.rb ADDED
@@ -0,0 +1,33 @@
+ module Textoken
+   # Scanner finds tokens in a word with a regexp.
+   # If the word does not match the regexp, #result returns nil.
+   class Scanner
+     attr_reader :word, :regexp
+
+     def initialize(word, regexp)
+       @word = word
+       @regexp = regexp
+       check_types
+     end
+
+     def result
+       scan = word.scan(regexp)
+       scan.length > 0 ? partition(scan, word) : nil
+     end
+
+     private
+
+     def partition(scan, word)
+       scan.each do |p|
+         word = word.gsub(p, ' ' + p + ' ')
+       end
+       word.split(' ')
+     end
+
+     def check_types
+       return if word.is_a?(String) && regexp.is_a?(Regexp)
+       Textoken.type_err("#{regexp} should be Regexp and #{word}
+         has to be a string.")
+     end
+   end
+ end
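A short sketch of Scanner with the same punctuation regexp Tokenizer uses below; outputs are indicative.

Textoken::Scanner.new('Hello,world!', /\W+/).result
# => ["Hello", ",", "world", "!"]
Textoken::Scanner.new('plain', /\W+/).result
# => nil, so callers fall back to the original word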
data/lib/textoken/searcher.rb ADDED
@@ -0,0 +1,45 @@
+ module Textoken
+   # This class reads the YAML file under regexps/option_values.yml and
+   # returns an array of regexps representing the values passed as an argument.
+   # TODO: add regexp support
+   class Searcher
+     attr_reader :values, :yaml
+
+     def initialize(values)
+       @values = check_and_init_values(values)
+       @yaml = load_file
+       @regexps = []
+     end
+
+     def regexps
+       match_keys
+       @regexps.map.each { |r| Regexp.new(r) }
+     end
+
+     private
+
+     def check_and_init_values(values)
+       values.split(',').map(&:strip)
+     rescue
+       Textoken.expression_err("#{values} are not supported. Correct format,
+         has to be 'numbers' or 'numbers, dates, phones'")
+     end
+
+     def load_file
+       YAML.load_file("#{GEM_ROOT}/lib/textoken/regexps/option_values.yml")
+     end
+
+     # Check the option values the user supplied; option values
+     # have to be declared in option_values.yml.
+     def match_keys
+       values.each do |v|
+         Textoken.expression_err("#{v}: is not permitted.") unless yaml.key?(v)
+         add_regexps(yaml[v])
+       end
+     end
+
+     def add_regexps(arr)
+       @regexps += arr
+     end
+   end
+ end
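A sketch of Searcher in isolation; the keys must match those declared in option_values.yml above.

Textoken::Searcher.new('numerics, phones').regexps
# => three Regexp objects built from the YAML entries
Textoken::Searcher.new('numbers').regexps
# raises Textoken::ExpressionError, since 'numbers' is not a key in option_values.yml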
data/lib/textoken/tokenizer.rb ADDED
@@ -0,0 +1,33 @@
+ module Textoken
+   # This is the last step in the process.
+   # Takes the findings and performs the finishing punctuation split;
+   # does not split if base.dont_split is true.
+   class Tokenizer
+     attr_reader :base, :result, :col
+
+     def initialize(base)
+       @base = base
+       @result = []
+       @col = base.options.collection.length > 0 ? base.findings : base.text
+     end
+
+     def tokens
+       return col if base.dont_split
+       split_punctuations
+       @result
+     end
+
+     private
+
+     def split_punctuations
+       col.each do |w|
+         @result += Scanner.new(w, default_regexp).result || [w]
+       end
+     end
+
+     # Used for finding punctuations.
+     def default_regexp
+       Regexp.new('\W+')
+     end
+   end
+ end
data/lib/textoken/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Textoken
+   VERSION = "1.0.0"
+ end
data/lib/textoken.rb ADDED
@@ -0,0 +1,42 @@
+ require 'yaml'
+
+ require 'textoken/version'
+ require 'textoken/base'
+ require 'textoken/errors'
+ require 'textoken/searcher'
+ require 'textoken/options'
+ require 'textoken/findings'
+ require 'textoken/tokenizer'
+ require 'textoken/scanner'
+
+ require 'textoken/options/less_than'
+ require 'textoken/options/more_than'
+ require 'textoken/options/only'
+ require 'textoken/options/exclude'
+
+ require 'textoken/factories/option_factory'
+
+ # Textoken is a library for customizable tokenization of texts.
+ # Customizable tokenization can be used in many areas, including NLP.
+ module Textoken
+   GEM_ROOT = File.expand_path('../..', __FILE__)
+
+   class << self
+     # Expression error, raised on wrong user input of options and values.
+     def expression_err(msg)
+       fail ExpressionError, msg
+     end
+
+     # Type error, raised when the user input format is right but the value
+     # is not suitable for the option.
+     def type_err(msg)
+       fail TypeError, msg
+     end
+   end
+ end
+
+ # A shortcut to initialize the gem:
+ # Textoken('I had rather be first in a village than second at Rome.')
+ def Textoken(text, options = nil)
+   Textoken::Base.new(text, options)
+ end
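An end-to-end sketch using the Textoken() shortcut above; the expected outputs are indicative, not taken from the gem's tests.

require 'textoken'

Textoken('Meeting is on 10/05/2015 at the office', only: 'dates').words
# => ["10/05/2015"]
Textoken('Meeting is on 10/05/2015 at the office', only: 'dates').tokens
# => ["10", "/", "05", "/", "2015"], because #tokens also splits punctuation
Textoken('Alfa beta.', more_than: 'x').tokens
# raises Textoken::TypeError, since more_than expects an Integer of 0 or more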
metadata ADDED
@@ -0,0 +1,110 @@
+ --- !ruby/object:Gem::Specification
+ name: textoken
+ version: !ruby/object:Gem::Version
+   version: 1.0.0
+ platform: ruby
+ authors:
+ - Mehmet Cetin
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-10-03 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 3.3.0
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 3.3.0
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 3.3.0
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 3.3.0
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+ - !ruby/object:Gem::Dependency
+   name: pry
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: "Textoken is a Ruby library for text tokenization. \n This gem extracts
+   words from text with many customizations. \n It can be used in many fields like
+   crawling and Natural Language Processing."
+ email:
+ - mcetin.cm@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - Rakefile
+ - lib/textoken.rb
+ - lib/textoken/base.rb
+ - lib/textoken/errors.rb
+ - lib/textoken/factories/option_factory.rb
+ - lib/textoken/findings.rb
+ - lib/textoken/options.rb
+ - lib/textoken/options/exclude.rb
+ - lib/textoken/options/less_than.rb
+ - lib/textoken/options/more_than.rb
+ - lib/textoken/options/only.rb
+ - lib/textoken/regexps/option_values.yml
+ - lib/textoken/scanner.rb
+ - lib/textoken/searcher.rb
+ - lib/textoken/tokenizer.rb
+ - lib/textoken/version.rb
+ homepage: https://github.com/manorie/textoken
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.5.1
+ signing_key:
+ specification_version: 4
+ summary: Simple and customizable text tokenization gem.
+ test_files: []