hashrules 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ MDJhZmM2MWMyMTkzNjc3NjkwNjYzZmRlMjkwMjllNjFkZmJlNmRlYw==
5
+ data.tar.gz: !binary |-
6
+ YjZiYTk1NDBkY2I1MzgzNmY4YmI5NGRhYTkyMDkwOGU4MTY0ODY5ZQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ YTgyMzk0NzQ3MDdiOWNiM2JlMjM3ZmM4ZTIwYTc3YWI2NDliNWEzNzE2OGRh
10
+ MWIwNGM3ZGI0MWU5YzhhYmFiMDBkN2ZkYzE1ZDUxYzE4ODVjOGVmMzg1Zjdk
11
+ ZTQ5MDdhNTc1MWE3YzNiMDBhZjA0MWM1ZjRjMjdmYmFmM2YwNDg=
12
+ data.tar.gz: !binary |-
13
+ MzRiMGI0N2NmYTk5NDhkZWM5MDQxNDM3ODU4ZDM2Y2Q5YWQ5YmJiM2E2M2Vl
14
+ ZmQxMDQyYzZmZjk3ZDU3OTk3Yjc3Mzg5ZmMxZTVhOThkN2VhZWI4OWNhMTQz
15
+ M2U3NjNiNjRjMzgzMmEzZDNjZjYyMmZhMjM0NWM0N2Y3N2U2NWM=
data/CHANGELOG.mdown ADDED
@@ -0,0 +1,52 @@
1
+
2
+ # ChangeLog
3
+
4
+ ## 1.1.4
5
+
6
+ * Bumped version due to need to republish to RubyGems (last version did not include all files)
7
+
8
+ ## 1.1.3
9
+
10
+ * Improved cache hits
11
+
12
+ ## 1.1.2
13
+
14
+ * Sort results that can explain more matches above those with fewer, even if the others cover more characters
15
+
16
+ ## 1.1.1
17
+
18
+ * Percent coverage should include spaces between matches, otherwise a fully covered string would never reach 100%
19
+
20
+ ## 1.1.0
21
+
22
+ * Added result caching
23
+
24
+ ## 1.0.3
25
+
26
+ * Bugfix: Some cases would sort a low-level submatch over a higher one. Now matchlevel is set accordingly and used as a tie-breaker when coverage is equal
27
+
28
+ ## 1.0.2
29
+
30
+ * Bugfix: Coverage was not set for parent matchers in the case of a successful submatch. This meant that sometimes a first-level match would be ranked lower (treated as a submatch) when another submatch was found.
31
+
32
+ ## 1.0.1
33
+
34
+ * Bugfix: When using smart search / submatching - don't count whitespace as coverage. This caused /germany/ in first level to gain less priority than /( |^)germany( |$)/ in USA (second level).
35
+ * Bugfix: Fix a certain case where a limit set to 1 would be interpreted as -1 (unlimited)
36
+
37
+ ## 1.0.0
38
+
39
+ This is a major version which breaks backwards compatibility.
40
+
41
+ * Changed API: HashRules#process now takes a single string and returns a new hash
42
+ * Added submatch feature to find results (deeply) nested even if a top level couldn't match.
43
+ * HashRules no longer cleans the string before processing (such as removing - and /); this is now the responsibility of the application.
44
+
45
+ ## 0.2.3
46
+
47
+ * Fix issue with using both() with strings (previously it would only accept no() matchers and regexes)
48
+
49
+ ## 0.2
50
+
51
+ * Add matcher keywords both() for applying AND operator and no() for negating an expression
52
+ * Change name to sc-hashrules
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gem 'turn'
4
+ gem 'shoulda'
5
+ gem 'mocha'
6
+ gem 'htmlentities'
7
+ gem 'rake'
8
+ gem 'awesome_print'
data/Gemfile.lock ADDED
@@ -0,0 +1,37 @@
1
+ GEM
2
+ remote: https://rubygems.org/
3
+ specs:
4
+ activesupport (3.2.9)
5
+ i18n (~> 0.6)
6
+ multi_json (~> 1.0)
7
+ ansi (1.4.3)
8
+ awesome_print (1.0.1)
9
+ bourne (1.1.2)
10
+ mocha (= 0.10.5)
11
+ htmlentities (4.3.1)
12
+ i18n (0.6.1)
13
+ metaclass (0.0.1)
14
+ mocha (0.10.5)
15
+ metaclass (~> 0.0.1)
16
+ multi_json (1.5.0)
17
+ rake (10.0.3)
18
+ shoulda (3.3.2)
19
+ shoulda-context (~> 1.0.1)
20
+ shoulda-matchers (~> 1.4.1)
21
+ shoulda-context (1.0.1)
22
+ shoulda-matchers (1.4.2)
23
+ activesupport (>= 3.0.0)
24
+ bourne (~> 1.1.2)
25
+ turn (0.9.6)
26
+ ansi
27
+
28
+ PLATFORMS
29
+ ruby
30
+
31
+ DEPENDENCIES
32
+ awesome_print
33
+ htmlentities
34
+ mocha
35
+ rake
36
+ shoulda
37
+ turn
data/LICENSE.txt ADDED
@@ -0,0 +1,4 @@
1
+ HashRules by Mikael Wikman is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.
2
+
3
+ For more information, please visit:
4
+ http://creativecommons.org/licenses/by-sa/3.0/
data/README.mdown ADDED
@@ -0,0 +1,143 @@
1
+
2
+ # HashRules
3
+
4
+ A simple yet powerful gem for building rule sets that identify the contents of a string in a structured, hierarchical manner.
5
+
6
+ ## Example
7
+
8
+ Suppose you're building a system for identifying countries, regions and cities within a string. An excerpt might look like this:
9
+
10
+ ```ruby
11
+ match 'united states', w(/u ?s ?a/) do
12
+ set country: 'United States'
13
+
14
+ match /( |^)cali/, 'ca' do
15
+ set state: 'California'
16
+ set region: 'West Coast'
17
+
18
+ match 'long beach', 'longbeach' do
19
+ set city: 'Long Beach'
20
+ end
21
+
22
+ match 'fontana' do
23
+ set city: 'Fontana'
24
+ end
25
+ end
26
+ end
27
+ ```
28
+
29
+ This simple DSL translates to a series of OR and AND statements. For example, if we were to write the exact same logic in plain if/then/else and regexes, the path for matching all the way to 'Long Beach' would be:
30
+
31
+ ```ruby
32
+ text = "Long Beach, California, United States"
33
+ text.gsub!(',','')
34
+ result = {}
35
+
36
+ if (text =~ /( |^)united states( |$)/ || text =~ /( |^)u ?s ?a( |$)/)
37
+ result['country'] = 'United States'
38
+
39
+ if (text =~ /( |^)cali/ || text =~ /( |^)ca( |$)/)
40
+ result['state'] = 'California'
41
+ result['region'] = 'West Coast'
42
+
43
+ if (text =~ /( |^)long beach( |$)/ || text =~ /( |^)longbeach( |$)/)
44
+ result['city'] = 'Long Beach'
45
+ elsif text =~ /( |^)fontana( |$)/
46
+ result['city'] = 'Fontana'
47
+ end
48
+ end
49
+ end
50
+ ```
51
+
52
+ ## Available matchers
53
+ <table>
54
+ <tr>
55
+ <th>Notation</th><th>Equivalence</th><th>Use case</th>
56
+ </tr>
57
+ <tr>
58
+ <td>
59
+ w(regex)
60
+ </td>
61
+ <td>
62
+ /( |^)#{regex}( |$)/
63
+ </td>
64
+ <td>
65
+ Create regex matching only whole words
66
+ </td>
67
+ </tr>
68
+
69
+ <tr>
70
+ <td>
71
+ 'regex'
72
+ </td>
73
+ <td>
74
+ /( |^)#{Regexp.escape(regex)}( |$)/
75
+ </td>
76
+ <td>
77
+ String matching whole words. 'key' would match "I have a key" but not "monkey"
78
+ </td>
79
+ </tr>
80
+
81
+ <tr>
82
+ <td>
83
+ no(matcher)
84
+ </td>
85
+ <td>
86
+ !(matcher.success?)
87
+ </td>
88
+ <td>
89
+ Inverted string or regex matcher
90
+ </td>
91
+ </tr>
92
+
93
+ <tr>
94
+ <td>
95
+ both(A,B)
96
+ </td>
97
+ <td>
98
+ A.success? && B.success?
99
+ </td>
100
+ <td>
101
+ Logical AND of given matchers, which may be nested
102
+ </td>
103
+ </tr>
104
+ </table>
105
+
106
+ ## Installation
107
+
108
+ `gem install hashrules`
109
+
110
+ ## Usage
111
+
112
+ ```
113
+ require 'hash_rules'
114
+
115
+ rules = HashRules.new(args)
116
+
117
+ # args:
118
+ # * folder: path to the folder containing one or more rule files (all will be read)
119
+
120
+ process_args={}
121
+ results = rules.process('Alabama, Canada', process_args)
122
+
123
+ results.each do |result|
124
+ p result
125
+ end
126
+ ```
127
+
128
+ Each result contains the following keys:
129
+
130
+ * data: This is the resulting data accumulated from the rules `set` statements
131
+ * coverage: An array of index pairs showing what part of the string was matched.
132
+ * match_id: Indicates the deepest matching rule id
133
+ * matchlevel: Indicates how deeply nested the first match was. If `max_submatch_level` is zero, this will always be 1.
134
+ * percent_coverage: A number from 0 to 100 indicating the percentage of the string that was matched
135
+
136
+ `process_args` may contain any of the following keys:
137
+
138
+ * max_submatch_level: Allow search to start from a nested matcher. If this value is 1, this means HashRules may skip at the very most one level. Recommended to be used with limit: -1
139
+ * limit: Stop the search when this number of matches has been found. Good for performance, but not recommended if `max_submatch_level` is more than zero because the results won't be guaranteed to contain the best match.
140
+
141
+ ## License
142
+
143
+ <a rel="license" href="http://creativecommons.org/licenses/by-sa/3.0/deed.en_US"><img alt="Creative Commons License" style="border-width:0" src="http://i.creativecommons.org/l/by-sa/3.0/80x15.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" property="dct:title">HashRules</span> by <span xmlns:cc="http://creativecommons.org/ns#" property="cc:attributionName">Mikael Wikman</span> is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/3.0/deed.en_US">Creative Commons Attribution-ShareAlike 3.0 Unported License</a>.
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
+ require 'rake/testtask'
2
+
3
+ Rake::TestTask.new do |t|
4
+ t.libs << 'test'
5
+ t.test_files = FileList['test/**/*_test.rb']
6
+ t.verbose = true
7
+ end
8
+
9
+ task :default => :test
data/app.gemspec ADDED
@@ -0,0 +1,16 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Gem::Specification.new do |gem|
4
+ gem.authors = ['Mikael Wikman']
5
+ gem.email = ['mikael@swedcontent.com']
6
+ gem.description = %q{Rule-based hash manipulator using custom DSL}
7
+ gem.summary = %q{ }
8
+ gem.homepage = "https://github.com/mikaelwikman/hashrules"
9
+
10
+ gem.files = `git ls-files`.split("\n")
11
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
12
+ gem.test_files = gem.files.grep(%r{^(test|features)/})
13
+ gem.name = "hashrules"
14
+ gem.require_paths = ["lib"]
15
+ gem.version = '1.1.4'
16
+ end
@@ -0,0 +1,167 @@
1
+ class HashMatcher
2
+ attr_reader :rules, :sets
3
+
4
+ def initialize
5
+ @rules = []
6
+ @sets = {}
7
+ @context = self
8
+ end
9
+
10
+ def include_folder folder
11
+ Dir["#{folder}/*.rb"].each do |file|
12
+ @current_folder = folder
13
+ contents = File.read(file)
14
+ eval(contents, binding)
15
+ end
16
+ end
17
+
18
+ def include_subfolder folder
19
+ include_folder "#{@current_folder}/#{folder}"
20
+ end
21
+
22
+ def to_s i=0
23
+ result = ""
24
+ sets.each do |k,v|
25
+ result << "#{" "*i}#{k} = #{v}\n"
26
+ end
27
+ rules.each do |regexes, matcher|
28
+ result << "\n#{" "*i}If match #{regexes.to_s}\n"
29
+ result << matcher.to_s(i+1)
30
+ result << "#{" "*i}End\n"
31
+ end
32
+ result
33
+ end
34
+
35
+ def analyze string, opts={}, level=1
36
+ matches = []
37
+ opts[:limit] ||= 1
38
+ skip_levels = opts[:skip_levels] || 0
39
+
40
+ rules.each do |regexes, matcher|
41
+ offsets = []
42
+ if regexes.find{|r| offsets=test(string,r)} || skip_levels > 0
43
+ opts[:skip_levels] = skip_levels-1
44
+ sub_matches = matcher.analyze(string, opts, level+1)
45
+
46
+ sub_matches.map do |m|
47
+ m['data'] = sets.merge(m['data'])
48
+ if offsets
49
+ m['matchlevel'] = level
50
+ offsets.each do |offset|
51
+ start, stop = offset
52
+
53
+ stop -= 1
54
+
55
+ start += 1 if string[start] == ' '
56
+ stop -= 1 if string[stop] == ' '
57
+
58
+ m['coverage'] << [start,stop]
59
+ end
60
+ end
61
+ end
62
+
63
+ matches += sub_matches
64
+
65
+ if skip_levels <= 0
66
+ opts[:limit] -= 1
67
+ end
68
+
69
+ if (opts[:limit]) == 0
70
+ break
71
+ end
72
+ end
73
+ end
74
+ if matches.empty? && skip_levels < 0
75
+ matches << { 'data' => sets.dup, 'coverage' => [], match_id: self.object_id}
76
+ end
77
+
78
+ matches
79
+ end
80
+
81
+ private
82
+
83
+ def test string, matcher
84
+ if matcher.is_a?(NoClass)
85
+ m = test(string,matcher.regex)
86
+ [[-1,-1]] if !m
87
+ elsif matcher.is_a?(AndClass)
88
+ r = matcher.regexes.map{|r| a=test(string,r); a[0] if a}
89
+ if r.all?{|r| r}
90
+ r.find_all{|r| r[0] != -1}
91
+ end
92
+ else
93
+ m = matcher.match(string)
94
+ [m.offset(0)] if m
95
+ end
96
+ end
97
+
98
+ def set sub_hash
99
+ @context.sets.merge! stringified sub_hash
100
+ end
101
+
102
+ def match *args, &block
103
+ regexes = args.map{|r| to_regex(r)}
104
+ matcher = HashMatcher.new
105
+ old_context = @context
106
+ old_folder = @current_folder
107
+ @context.rules << [regexes, matcher]
108
+ @context = matcher
109
+ block.call
110
+ @context = old_context
111
+ @current_folder = old_folder
112
+ end
113
+
114
+ def w(regex) # make it match whole words
115
+ /(^| )#{regex.source}($| )/
116
+ end
117
+
118
+ def no(regex)
119
+ NoClass.new(to_regex(regex))
120
+ end
121
+
122
+ def both(*regexes)
123
+ AndClass.new(to_regex(regexes))
124
+ end
125
+
126
+ def to_regex(matcher)
127
+ if matcher.kind_of?(String)
128
+ /(^| )#{Regexp.escape(matcher)}($| )/
129
+ elsif matcher.kind_of?(Array)
130
+ matcher.map{|r| to_regex(r)}
131
+ else
132
+ matcher
133
+ end
134
+ end
135
+
136
+ def stringified hash
137
+ hash.keys.each do |key|
138
+ val = hash.delete(key)
139
+ hash[key.to_s] = val
140
+ end
141
+ hash
142
+ end
143
+
144
+ class NoClass
145
+ attr_reader :regex
146
+
147
+ def initialize regex
148
+ @regex = regex
149
+ end
150
+
151
+ def to_s
152
+ "!(#{@regex})"
153
+ end
154
+ end
155
+
156
+ class AndClass
157
+ attr_reader :regexes
158
+
159
+ def initialize regexes
160
+ @regexes = regexes
161
+ end
162
+
163
+ def to_s
164
+ @regexes.map{|r| r.inspect}.join(' AND ')
165
+ end
166
+ end
167
+ end
data/lib/hash_rules.rb ADDED
@@ -0,0 +1,124 @@
1
+ require 'hash_matcher'
2
+ require 'json'
3
+
4
+ class HashRules
5
+
6
+ def initialize args
7
+ @folder = args[:folder] || raise("No folder specified!")
8
+
9
+ @hashmatcher = HashMatcher.new
10
+ @hashmatcher.include_folder(@folder)
11
+ @cache = {}
12
+ end
13
+
14
+ def process string, opts={}
15
+ string = clean_string(string)
16
+
17
+ if cached=@cache[string]
18
+ return Marshal.load(cached)
19
+ end
20
+
21
+ result = Processor.new(string, @hashmatcher, opts).do
22
+
23
+ @cache[string] = Marshal.dump(result)
24
+ result
25
+ end
26
+
27
+ def clean_string string
28
+ string.gsub /\s+/, ' '
29
+ end
30
+
31
+ def to_s
32
+ "== HASHRULES ==" << @hashmatcher.to_s
33
+ end
34
+
35
+ class Processor
36
+ def initialize string, hashmatcher, opts
37
+ @string = string.dup
38
+ @hashmatcher = hashmatcher
39
+ @max_submatch_level = opts[:max_submatch_level] || 0
40
+ @limit = opts[:limit] || 1
41
+ @memory = []
42
+ end
43
+
44
+ def do
45
+ each_submatch_level do |submatch_level|
46
+ add_to_list(new_results = analyze(submatch_level))
47
+ break if reached_limit?
48
+ end
49
+
50
+ sort_by_coverage
51
+
52
+ list
53
+ end
54
+
55
+ private
56
+
57
+ def analyze submatch_level
58
+ limit = @limit <= 0 ? -1 : (results_count() - @limit).abs
59
+ opts = {
60
+ skip_levels: submatch_level,
61
+ limit: limit
62
+ }
63
+
64
+ matches = @hashmatcher.analyze(@string, opts)
65
+ matches = matches.delete_if{|m| m['data'].empty?}
66
+
67
+ matches.each do |m|
68
+ coverage = Array.new(@string.length, false)
69
+ m['coverage'].each do |start, stop|
70
+ (start..stop).each do |i|
71
+ coverage[i] = true
72
+ end
73
+ end
74
+ (1..(coverage.count-2)).each do |i|
75
+ if coverage[i-1] && coverage[i+1]
76
+ coverage[i] = true
77
+ end
78
+ end
79
+
80
+ m['percent_coverage'] = coverage.find_all{|c| c}.count * 100 / @string.length
81
+ end
82
+
83
+ matches
84
+ end
85
+
86
+ def each_submatch_level &block
87
+ (0..@max_submatch_level).each &block
88
+ end
89
+
90
+ def reached_limit?
91
+ @limit > 0 && @memory.count >= @limit
92
+ end
93
+
94
+ def results_count
95
+ @memory.count
96
+ end
97
+
98
+ def add_to_list results
99
+ @memory += results
100
+ # results.each do |result|
101
+ # @memory << result unless @memory.any?{|m| m[:match_id] == result[:match_id]}
102
+ # end
103
+ end
104
+
105
+ def sort_by_coverage
106
+ @memory.sort! do |a,b|
107
+ result = b['coverage'].count <=> a['coverage'].count
108
+ if result == 0
109
+ result = b['percent_coverage'] <=> a['percent_coverage']
110
+
111
+ if result == 0
112
+ result = a['matchlevel'] <=> b['matchlevel']
113
+ end
114
+ end
115
+ result
116
+ end
117
+ end
118
+
119
+ def list
120
+ @memory
121
+ end
122
+
123
+ end
124
+ end
@@ -0,0 +1,3 @@
1
+ match 'oregon' do
2
+ set region: 'Oregon'
3
+ end
@@ -0,0 +1,4 @@
1
+
2
+ match /ми 8т/ do
3
+ set manufacturer: 'Ми-8Т'
4
+ end
@@ -0,0 +1,8 @@
1
+
2
+ match "match" do
3
+ set match: 'first'
4
+ end
5
+
6
+ match "match" do
7
+ set match: 'second'
8
+ end
@@ -0,0 +1,14 @@
1
+
2
+ match both(/per/, no(/piper/)) do
3
+ set manufacturer: 'Per'
4
+ end
5
+
6
+ match both('string', /regex/) do
7
+ set manufacturer: 'success'
8
+ end
9
+
10
+ match /piper/ do
11
+ set manufacturer: 'Piper'
12
+
13
+ include_subfolder 'piper'
14
+ end
@@ -0,0 +1,19 @@
1
+
2
+ match w(/pa ?28/) do # => /( |^)pa ?28( |$)/
3
+ set family: 'PA-28 Cherokee'
4
+ set engine_count: 1
5
+ set category: 'piston'
6
+
7
+ match 'pa28 181', 'pa 28 181' do
8
+ set model: 'PA-28-181'
9
+ set horsepower: 180
10
+
11
+ match /archer ii/, /archer 2/, /ii/ do
12
+ set model: 'PA-28-181 Archer II'
13
+ end
14
+
15
+ match /archer iii/, /archer 3/, /iii/ do
16
+ set model: 'PA-28-181 Archer III'
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,4 @@
1
+ match 'ohio' do
2
+ set region: 'Ohio'
3
+ include_subfolder('ohio')
4
+ end
@@ -0,0 +1,7 @@
1
+ match 'oregon', 'or' do
2
+ set city: 'Oregon'
3
+
4
+ match 'pearson' do
5
+ set place: 'pearson park'
6
+ end
7
+ end
@@ -0,0 +1,10 @@
1
+
2
+ match 'canada' do
3
+ set country: 'Canada'
4
+ include_subfolder('canada')
5
+ end
6
+
7
+ match w(/u ?s ?a?/), 'united states' do
8
+ set country: 'United States'
9
+ include_subfolder('united states')
10
+ end
@@ -0,0 +1,178 @@
1
+ # encoding: utf-8
2
+
3
+ require 'test_helper'
4
+ require 'hash_rules'
5
+
6
+ class HashRulesTest < TestCase
7
+ context 'hashrules, normal operation' do
8
+
9
+ setup do
10
+ @it = HashRules.new(folder: 'test/examples')
11
+ end
12
+
13
+ should 'return empty array when no match' do
14
+ assert_equal [], @it.process('curry')
15
+ end
16
+
17
+ should 'identify manufacturer' do
18
+ data = @it.process("1986 piper tjobahobo").first['data']
19
+ assert_equal 'Piper', data['manufacturer']
20
+ end
21
+
22
+ should 'identify perfect match' do
23
+ data = @it.process("piper pa28 181 ii").first['data']
24
+ assert_equal 'PA-28-181 Archer II', data['model']
25
+ end
26
+
27
+ should 'identify model' do
28
+ data = @it.process("2001 piper pa28 181 ii").first['data']
29
+ assert_equal 'PA-28-181 Archer II', data['model']
30
+ end
31
+
32
+ should 'allow numbers or letter next to match' do
33
+ data = @it.process("apiperloon").first['data']
34
+ assert_equal "Piper", data['manufacturer']
35
+ end
36
+
37
+ should 'match on string' do
38
+ data = @it.process("piper pa 28 181").first['data']
39
+ assert_equal "PA-28-181", data['model']
40
+ end
41
+
42
+ should 'discard double whitespace' do
43
+ data = @it.process("piper \t\r \npa 28\n 181").first['data']
44
+ assert_equal "PA-28-181", data['model']
45
+ end
46
+
47
+ should 'use "word" to make regexes match whole words' do
48
+ data = @it.process("piper apa-280").first['data']
49
+ assert_equal nil, data['family']
50
+ end
51
+
52
+ should 'present us with matched slices and percentage covered' do
53
+ data = @it.process("i'd like a piper in the pa28 family")
54
+ first = data.first
55
+
56
+ assert_equal 'Piper', first['data']['manufacturer']
57
+ assert_equal [[24, 27], [11, 15]], first['coverage']
58
+ assert_equal 25, first['percent_coverage']
59
+ end
60
+
61
+ should 'include spaces between matches in percentage' do
62
+ data = @it.process("oregon canada")
63
+ first = data.first
64
+
65
+ assert_equal 100, first['percent_coverage']
66
+ end
67
+
68
+ context 'both and no' do
69
+ should 'make operation trees' do
70
+ result = @it.process('person')
71
+ assert_equal 'Per', result.first['data']['manufacturer']
72
+ end
73
+
74
+ should 'use both with strings and regexes' do
75
+ data = @it.process('string rregexx').first['data']
76
+ assert_equal 'success', data['manufacturer']
77
+ end
78
+ end
79
+
80
+
81
+ should 'not allow numbers or letter next to STRING match' do
82
+ data = @it.process("piper apa-28 18100").first['data']
83
+ assert_equal nil, data['model']
84
+ end
85
+
86
+ should 'not allow several matches in the same context' do
87
+ # note: the rules are written in regex /ii/ which means it will also match /iii/
88
+ data = @it.process('piper pa28 181 archer iii', limit: 1).first['data']
89
+ assert_equal 'PA-28-181 Archer II', data['model']
90
+ end
91
+
92
+ context "case insensitive" do
93
+ should 'match cyrillic letters' do
94
+ data = @it.process("ми 8т").first['data']
95
+ assert_equal "Ми-8Т", data['manufacturer']
96
+ end
97
+ end
98
+ end
99
+
100
+ context 'multimatch' do
101
+
102
+ setup do
103
+ @it = HashRules.new(folder: 'test/examples')
104
+ end
105
+
106
+ should 'match several if indicated' do
107
+ r = @it.process('this should return two match', limit: -1)
108
+ assert_equal 2, r.count
109
+ first, second = r
110
+
111
+ assert_equal 'first', first['data']['match']
112
+ assert_equal 'second', second['data']['match']
113
+ end
114
+ end
115
+
116
+ context 'hashrules, submatch' do
117
+
118
+ setup do
119
+ @it = HashRules.new(folder: 'test/examples')
120
+ end
121
+
122
+ should 'match with adequate information, just as without submatch' do
123
+ r = @it.process('oregon ohio united states').first['data']
124
+ assert_equal 'United States', r['country']
125
+ assert_equal 'Ohio', r['region']
126
+ assert_equal 'Oregon', r['city']
127
+
128
+ r = @it.process('canada oregon').first['data']
129
+ assert_equal 'Canada', r['country']
130
+ assert_equal 'Oregon', r['region']
131
+ assert_equal nil, r['city']
132
+ end
133
+
134
+ should 'prefer to match on 2:nd level over 3:rd level' do
135
+ r = @it.process('oregon', max_submatch_level: 1, limit: 1).first['data']
136
+
137
+ # united states have a city called 'oregon', but which is on 3rd level,
138
+ # Oregon in canada is a state, and on the 2nd level
139
+ assert_equal 'Canada', r['country']
140
+ end
141
+
142
+ should 'prefer several multimatch over a single match and many characters' do
143
+
144
+ # there is no such place as 'or us' in canada, however there is or in united states
145
+ r = @it.process('canada or us', max_submatch_level: 6, limit: -1)
146
+
147
+ assert_equal 3, r.count
148
+ assert_equal 'United States', r[0]['data']['country']
149
+ end
150
+
151
+ should 'strive for 100% matach if possible' do
152
+ r = @it.process('oregon ohio', max_submatch_level: 5, limit: -1)
153
+
154
+ assert_equal 3, r.count
155
+
156
+ # Without intelligence, Oregon (canada) would be chosen because it's a state on the 2nd level, and Oregon in US is a city on 3rd level. However, we reason that if a 3rd level match can explain more about a string then it is reasonably more likely to be accurate.
157
+
158
+ assert_equal 'Ohio', r[0]['data']['region']
159
+ end
160
+
161
+ should 'backtrack coverage when found submatch' do
162
+ r = @it.process('pearson ohio', max_submatch_level: 6, limit: -1)
163
+
164
+ assert_equal [[0,6],[8,11]], r[0]['coverage']
165
+ end
166
+
167
+ should 'include matchlevel' do
168
+ r = @it.process('pearson ohio', max_submatch_level: 6, limit: -1)
169
+ assert_equal 2, r[0]['matchlevel']
170
+
171
+ r = @it.process('oregon', max_submatch_level: 6, limit: -1)
172
+ assert_equal 'Canada', r[0]['data']['country']
173
+ assert_equal 2, r[0]['matchlevel']
174
+ assert_equal 'United States', r[1]['data']['country']
175
+ assert_equal 3, r[1]['matchlevel']
176
+ end
177
+ end
178
+ end
@@ -0,0 +1,12 @@
1
+ require 'bundler/setup'
2
+ require 'test/unit'
3
+ require 'turn/autorun'
4
+ require 'shoulda'
5
+ require 'mocha'
6
+
7
+ #Turn.config.format = :dot
8
+
9
+ $LOAD_PATH << 'lib'
10
+
11
+ class TestCase < Test::Unit::TestCase
12
+ end
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hashrules
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.1.4
5
+ platform: ruby
6
+ authors:
7
+ - Mikael Wikman
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-07-04 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Rule-based hash manipulator using custom DSL
14
+ email:
15
+ - mikael@swedcontent.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - CHANGELOG.mdown
21
+ - Gemfile
22
+ - Gemfile.lock
23
+ - LICENSE.txt
24
+ - README.mdown
25
+ - Rakefile
26
+ - app.gemspec
27
+ - lib/hash_matcher.rb
28
+ - lib/hash_rules.rb
29
+ - test/examples/canada/oregon.rb
30
+ - test/examples/cyrillic.rb
31
+ - test/examples/match_two.rb
32
+ - test/examples/piper.rb
33
+ - test/examples/piper/pa28.rb
34
+ - test/examples/united states/ohio.rb
35
+ - test/examples/united states/ohio/oregon.rb
36
+ - test/examples/world.rb
37
+ - test/hash_rules_test.rb
38
+ - test/test_helper.rb
39
+ homepage: https://github.com/mikaelwikman/hashrules
40
+ licenses: []
41
+ metadata: {}
42
+ post_install_message:
43
+ rdoc_options: []
44
+ require_paths:
45
+ - lib
46
+ required_ruby_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ! '>='
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ required_rubygems_version: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ requirements: []
57
+ rubyforge_project:
58
+ rubygems_version: 2.0.3
59
+ signing_key:
60
+ specification_version: 4
61
+ summary: ''
62
+ test_files:
63
+ - test/examples/canada/oregon.rb
64
+ - test/examples/cyrillic.rb
65
+ - test/examples/match_two.rb
66
+ - test/examples/piper.rb
67
+ - test/examples/piper/pa28.rb
68
+ - test/examples/united states/ohio.rb
69
+ - test/examples/united states/ohio/oregon.rb
70
+ - test/examples/world.rb
71
+ - test/hash_rules_test.rb
72
+ - test/test_helper.rb