textoken 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: cf70d771ada2f8b5edaec1b2f6b4c97a4efa563a
4
+ data.tar.gz: eead0c387504164a302e687fa5858b1bf270d972
5
+ SHA512:
6
+ metadata.gz: 13f3ff9bd02f7b57954e791ea0c068c3c1b529d7056a7067fa66100c0992e1299cfa0e897a7b27522fa557dfbb6290b2191bf8654a366df38ceb209a30ae9c7f
7
+ data.tar.gz: d691c3914c7cb78dd396d80d9b8524e2d5e1b3e2786a8518da1558c13916937b59692f29b7723e963aa935c5e42e764c608e9c80d5d7e1b705a1d331e3dfd2c7
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
require "bundler/gem_tasks"
require "rspec/core/rake_task"

require "rake"

# Defines the :spec task for running the RSpec suite.
RSpec::Core::RakeTask.new

# Both `rake` (default) and `rake test` run the specs.
task default: :spec
task test: :spec
@@ -0,0 +1,39 @@
1
module Textoken
  # Inits options and findings and responds to #tokens.
  # Does not raise an error when text or options are nil.
  # Splits the text and makes it ready for the other operations.
  class Base
    attr_reader :text, :dont_split, :findings, :options

    # text - the raw text to tokenize (nil is treated as empty)
    # opt  - an options Hash like { more_than: 3 }, or nil
    def initialize(text, opt = nil)
      @text = initial_split(text)
      @options = Options.new(opt)
    end

    # Returns the tokenized words as an Array.
    # We take the intersection of the result arrays returned by
    # multiple options so every option constrains the token set.
    # @findings is rebuilt on every call so repeated calls are
    # idempotent (previously results leaked between calls).
    def tokens
      @findings = nil
      options.collection.each do |option|
        if @findings.nil?
          @findings = option.tokenize(self)
        else
          @findings &= option.tokenize(self)
        end
      end

      Tokenizer.new(self).tokens
    end

    # Tokenize with options but skip the final punctuation split.
    def words
      @dont_split = true
      tokens
    ensure
      # reset so a later #tokens call splits punctuation again
      # (previously the flag stuck and poisoned subsequent calls)
      @dont_split = false
    end

    private

    # Initial whitespace split; nil text yields an empty Array.
    def initial_split(text)
      text ? text.split(' ') : []
    end
  end
end
@@ -0,0 +1,7 @@
1
module Textoken
  # Errors subclass StandardError (not Exception) so they are
  # caught by a plain `rescue`; subclassing Exception is an
  # anti-pattern (it also swallows SignalException/SystemExit
  # when callers rescue it broadly).

  # Raised for malformed user input (unknown options, bad format).
  class ExpressionError < StandardError
  end

  # Raised when an option value has the wrong type or range.
  class TypeError < StandardError
  end
end
@@ -0,0 +1,23 @@
1
module Textoken
  # This factory creates option objects from user input options.
  # A user input option key like :more_than gets camelized to
  # 'MoreThan' and the matching class is initialized, with error
  # handling for unknown option keys.
  #
  # NOTE: the original used a bare `private` before `def self.`
  # methods, which is a no-op for singleton methods — camelize and
  # option_klass were actually public. `class << self` makes the
  # intended visibility real.
  module OptionFactory
    class << self
      # key   - option name Symbol/String, e.g. :more_than
      # value - the value passed for that option
      # Returns a new option object (e.g. MoreThan).
      def build(key, value)
        option_klass(key).new(value)
      end

      private

      # :more_than -> "MoreThan"
      def camelize(key)
        key.to_s.split('_').map(&:capitalize).join
      end

      # Resolves the class inside the Textoken namespace so
      # arbitrary top-level classes stay unreachable.
      def option_klass(key)
        Textoken.const_get(camelize(key).to_sym)
      rescue NameError
        Textoken.expression_err("#{key}: is not a valid option.")
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
module Textoken
  # Collects split words together with their array index and
  # de-duplicates multiple hits on the same index coming from
  # regexps that reveal the same/overlapping results.
  class Findings
    def initialize
      @collection = []
    end

    # Pushes a word with its index; the index is used later for
    # sorting and uniqueness.
    # Raises Textoken::TypeError on wrong argument types.
    def push(index, word)
      type_check(index, word)
      @collection << [index, word]
    end

    # Returns a sorted, unique array of [index, word] pairs.
    # Using the pair itself as the uniqueness key avoids the
    # collision the old string-concat key had (e.g. [1, "0a"] and
    # [10, "a"] both produced "10a" and one entry was dropped).
    def collection
      @collection.uniq.sort_by(&:first)
    end

    # Returns a flat, ordered array of words.
    def result
      collection.map(&:last)
    end

    private

    # Integer covers the old Fixnum, which was removed in Ruby 3
    # and made this check raise NameError; Float stays accepted
    # as before.
    def type_check(i, word)
      return if word.is_a?(String) && (i.is_a?(Integer) || i.is_a?(Float))
      Textoken.type_err("#{word} and #{i} has to be a String and Integer")
    end
  end
end
@@ -0,0 +1,28 @@
1
module Textoken
  # Excludes words that match ANY of the option's regexps; all
  # remaining words are pushed to the findings.
  class Exclude
    attr_reader :regexps, :findings

    # Priority 1: regexp-based filters run before length filters.
    def priority
      1
    end

    # values - searchable names resolved to Regexps by Searcher
    def initialize(values)
      @regexps = Searcher.new(values).regexps
      @findings = Findings.new
    end

    # base.text is the raw text split on ' '.
    # A word is kept only when it matches none of the regexps.
    # Previously a word matching one regexp but not another was
    # still pushed for every non-matching regexp and so leaked
    # back into the result when multiple values were excluded.
    def tokenize(base)
      base.text.each_with_index do |word, i|
        findings.push(i, word) if regexps.none? { |r| word.match(r) }
      end
      findings.result
    end
  end
end
@@ -0,0 +1,32 @@
1
module Textoken
  # This option object picks words in text whose length is less
  # than the option value.
  class LessThan
    attr_reader :number, :findings

    # Priority 2: length filters run after regexp filters.
    def priority
      2
    end

    # value - the exclusive upper length bound; must be an Integer
    #         of at least 2, otherwise Textoken::TypeError.
    def initialize(value)
      check_value(value)
      @number = value
      @findings = Findings.new
    end

    # Pushes every word shorter than `number` with its index and
    # returns the resulting word array.
    def tokenize(base)
      base.text.each_with_index do |word, i|
        findings.push(i, word) if word.length < number
      end
      findings.result
    end

    private

    # Integer#is_a? replaces `class == Fixnum`, which raised
    # NameError on Ruby 3 where Fixnum was removed.
    def check_value(value)
      return if value.is_a?(Integer) && value > 1
      Textoken.type_err "value #{value} is not permitted for
      less_than option it has to be 2 at least."
    end
  end
end
@@ -0,0 +1,32 @@
1
module Textoken
  # This option object picks words in text whose length is more
  # than the option value.
  class MoreThan
    attr_reader :number, :findings

    # Priority 2: length filters run after regexp filters.
    def priority
      2
    end

    # value - the exclusive lower length bound; must be a
    #         non-negative Integer, otherwise Textoken::TypeError.
    def initialize(value)
      check_value(value)
      @number = value
      @findings = Findings.new
    end

    # Pushes every word longer than `number` with its index and
    # returns the resulting word array.
    def tokenize(base)
      base.text.each_with_index do |word, i|
        findings.push(i, word) if word.length > number
      end
      findings.result
    end

    private

    # Integer#is_a? replaces `class == Fixnum`, which raised
    # NameError on Ruby 3 where Fixnum was removed.
    def check_value(value)
      return if value.is_a?(Integer) && value >= 0
      Textoken.type_err "value #{value} is not permitted for
      more_than option it has to be 0 at least."
    end
  end
end
@@ -0,0 +1,28 @@
1
module Textoken
  # Selects only the words that match one of the option's regexps.
  # The regexp names are resolved from option_values.yml.
  class Only
    attr_reader :regexps, :findings

    # Priority 1: regexp-based filters run before length filters.
    def priority
      1
    end

    # values - searchable names (e.g. 'dates, phones') resolved to
    # an Array of Regexps by Searcher.
    def initialize(values)
      @regexps = Searcher.new(values).regexps
      @findings = Findings.new
    end

    # base.text is the raw text split on ' '. Every word matching
    # any regexp is pushed with its index into findings, which
    # de-duplicates and sorts the final result.
    def tokenize(base)
      regexps.each do |pattern|
        base.text.each_with_index do |word, idx|
          next unless word.match(pattern)
          findings.push(idx, word)
        end
      end
      findings.result
    end
  end
end
@@ -0,0 +1,36 @@
1
module Textoken
  # Creates the collection array and checks the basic hash format.
  # Responds to #collection as an array of option objects sorted
  # by priority. Does not raise an exception when options are nil.
  class Options
    attr_reader :options

    # opt - a Hash of option keys/values, or nil.
    # Raises ExpressionError for any other type.
    def initialize(opt)
      @options = opt
      @collection = []
      check_options_format
    end

    # Returns the option objects sorted by priority. Built once and
    # memoized: previously every call re-appended all options to
    # @collection, duplicating entries on repeated calls (and it is
    # called more than once per tokenization).
    def collection
      return @collection if @built
      options.each { |k, v| init_option(k, v) } if options
      sort_collection
      @built = true
      @collection
    end

    private

    def check_options_format
      return if options.nil? || options.is_a?(Hash)
      Textoken.expression_err("#{options} is not a valid format, you can use;
      Textoken('Alfa beta.', exclude: 'dates, phones', more_than: 3)")
    end

    # Builds one option object through the factory.
    def init_option(key, value)
      @collection << Textoken::OptionFactory.build(key, value)
    end

    # Lower priority runs first (regexp filters before length).
    def sort_collection
      @collection.sort! { |a, b| a.priority <=> b.priority }
    end
  end
end
@@ -0,0 +1,22 @@
1
# This file stands for options you can pass as an argument like,
# only: 'dates, numbers, etc..' or exclude: 'dates'

# You can add many regexps under one type, all of them will be searchable

punctuations:
  # punctuations (any run of non-word characters)
  - \W+

numerics:
  # integers and decimals with a dot or a comma separator
  - (?:\d*\.)?\d+
  - (?:\d*\,)?\d+

phones:
  # covers p:444-555-1234 f:246.555.8888 m:1235554567
  - \b\d{3}[-.]?\d{3}[-.]?\d{4}\b

dates:
  # mm/dd/yyyy format
  - ^(0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])[- /.](19|20)\d\d$
  # dd-mm-yyyy format
  - ^(0[1-9]|[12][0-9]|3[01])[- /.](0[1-9]|1[012])[- /.](19|20)\d\d$
@@ -0,0 +1,33 @@
1
module Textoken
  # Finds tokens in a word with a regexp and splits the word
  # around every match. Returns nil when the regexp does not match
  # at all (callers use that nil to fall back to the raw word).
  class Scanner
    attr_reader :word, :regexp

    # word   - a String to scan
    # regexp - the Regexp to search for
    # Raises Textoken::TypeError on wrong argument types.
    def initialize(word, regexp)
      @word = word
      @regexp = regexp
      check_types
    end

    # Array of tokens with the matches isolated, or nil when the
    # word contains no match.
    def result
      matches = word.scan(regexp)
      return nil if matches.empty?
      partition(matches, word)
    end

    private

    # Pads every match with spaces, then splits on whitespace so
    # the matches become stand-alone tokens.
    def partition(matches, text)
      padded = matches.reduce(text) { |acc, m| acc.gsub(m, ' ' + m + ' ') }
      padded.split(' ')
    end

    def check_types
      return if word.is_a?(String) && regexp.is_a?(Regexp)
      Textoken.type_err("#{regexp} should be Regexp and #{word}
      has to be a string.")
    end
  end
end
@@ -0,0 +1,45 @@
1
module Textoken
  # Reads the YAML file under regexps/option_values.yml and returns
  # an array of Regexps representing the value names passed in.
  class Searcher
    attr_reader :values, :yaml

    # values - a String like 'numbers' or 'numbers, dates, phones'
    def initialize(values)
      @values = check_and_init_values(values)
      @yaml = load_file
      @regexps = []
    end

    # Returns the matching Regexps. Keys are matched only once and
    # the flag memoizes that work: previously every call re-ran
    # match_keys and kept appending duplicate patterns to @regexps.
    def regexps
      unless @matched
        match_keys
        @matched = true
      end
      # plain #map replaces the redundant `@regexps.map.each`
      @regexps.map { |r| Regexp.new(r) }
    end

    private

    # Splits and strips the comma-separated names; anything that
    # cannot be split (nil, numbers, ...) raises ExpressionError.
    def check_and_init_values(values)
      values.split(',').map(&:strip)
    rescue StandardError
      Textoken.expression_err("#{values} are not supported. Correct format,
      has to be 'numbers' or 'numbers, dates, phones'")
    end

    def load_file
      YAML.load_file("#{GEM_ROOT}/lib/textoken/regexps/option_values.yml")
    end

    # Every user-supplied value name has to be declared in
    # option_values.yml, otherwise ExpressionError is raised.
    def match_keys
      values.each do |v|
        Textoken.expression_err("#{v}: is not permitted.") unless yaml.key?(v)
        add_regexps(yaml[v])
      end
    end

    def add_regexps(arr)
      @regexps += arr
    end
  end
end
@@ -0,0 +1,33 @@
1
module Textoken
  # The last step in the pipeline: takes the collected words and
  # performs the finishing punctuation split. Skips splitting when
  # base.dont_split is set (the #words entry point).
  class Tokenizer
    # Compiled once instead of once per word as before.
    DEFAULT_REGEXP = Regexp.new('\W+')

    attr_reader :base, :result, :col

    # When no options were supplied, fall back to the raw split
    # text; otherwise use the findings produced by the options.
    def initialize(base)
      @base = base
      @result = []
      @col = base.options.collection.length > 0 ? base.findings : base.text
    end

    # Returns the final token array.
    def tokens
      return col if base.dont_split
      split_punctuations
      @result
    end

    private

    # Splits punctuation off every word; Scanner#result returns nil
    # when a word has no punctuation, so the word is kept as-is.
    def split_punctuations
      col.each do |w|
        @result += Scanner.new(w, DEFAULT_REGEXP).result || [w]
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Textoken
  # Gem version, referenced by the gemspec.
  VERSION = "1.0.0"
end
data/lib/textoken.rb ADDED
@@ -0,0 +1,42 @@
1
+ require 'yaml'
2
+
3
+ require 'textoken/version'
4
+ require 'textoken/base'
5
+ require 'textoken/errors'
6
+ require 'textoken/searcher'
7
+ require 'textoken/options'
8
+ require 'textoken/findings'
9
+ require 'textoken/tokenizer'
10
+ require 'textoken/scanner'
11
+
12
+ require 'textoken/options/less_than'
13
+ require 'textoken/options/more_than'
14
+ require 'textoken/options/only'
15
+ require 'textoken/options/exclude'
16
+
17
+ require 'textoken/factories/option_factory'
18
+
19
# Textoken is a library for customizable tokenization of texts.
# Customizable tokenization can be used in many areas including NLP purposes
module Textoken
  # Absolute path to the gem root, used to locate bundled YAML files.
  GEM_ROOT = File.expand_path('../..', __FILE__)

  # Expression error raised on wrong user input of options & values.
  def self.expression_err(msg)
    raise ExpressionError, msg
  end

  # Type error raised when the user input format is right but the
  # option value is not suitable for the option.
  def self.type_err(msg)
    raise TypeError, msg
  end
end
37
+
38
# A shortcut to initialize the gem:
#   Textoken('I had rather be first in a village than second at Rome.')
# text    - the text to tokenize (nil allowed)
# options - optional Hash of tokenization options, or nil
# Returns a Textoken::Base instance responding to #tokens and #words.
def Textoken(text, options = nil)
  Textoken::Base.new(text, options)
end
metadata ADDED
@@ -0,0 +1,110 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: textoken
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Mehmet Cetin
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-10-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 3.3.0
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 3.3.0
23
+ type: :development
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: 3.3.0
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 3.3.0
33
+ - !ruby/object:Gem::Dependency
34
+ name: rake
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '10.0'
40
+ type: :development
41
+ prerelease: false
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '10.0'
47
+ - !ruby/object:Gem::Dependency
48
+ name: pry
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ description: "Textoken is a Ruby library for text tokenization. \n This gem extracts
62
+ words from text with many customizations. \n It can be used in many fields like
63
+ crawling and Natural Language Processing."
64
+ email:
65
+ - mcetin.cm@gmail.com
66
+ executables: []
67
+ extensions: []
68
+ extra_rdoc_files: []
69
+ files:
70
+ - Rakefile
71
+ - lib/textoken.rb
72
+ - lib/textoken/base.rb
73
+ - lib/textoken/errors.rb
74
+ - lib/textoken/factories/option_factory.rb
75
+ - lib/textoken/findings.rb
76
+ - lib/textoken/options.rb
77
+ - lib/textoken/options/exclude.rb
78
+ - lib/textoken/options/less_than.rb
79
+ - lib/textoken/options/more_than.rb
80
+ - lib/textoken/options/only.rb
81
+ - lib/textoken/regexps/option_values.yml
82
+ - lib/textoken/scanner.rb
83
+ - lib/textoken/searcher.rb
84
+ - lib/textoken/tokenizer.rb
85
+ - lib/textoken/version.rb
86
+ homepage: https://github.com/manorie/textoken
87
+ licenses:
88
+ - MIT
89
+ metadata: {}
90
+ post_install_message:
91
+ rdoc_options: []
92
+ require_paths:
93
+ - lib
94
+ required_ruby_version: !ruby/object:Gem::Requirement
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ version: '0'
99
+ required_rubygems_version: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ requirements: []
105
+ rubyforge_project:
106
+ rubygems_version: 2.4.5.1
107
+ signing_key:
108
+ specification_version: 4
109
+ summary: Simple and customizable text tokenization gem.
110
+ test_files: []