hashrules 1.1.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ MDJhZmM2MWMyMTkzNjc3NjkwNjYzZmRlMjkwMjllNjFkZmJlNmRlYw==
5
+ data.tar.gz: !binary |-
6
+ YjZiYTk1NDBkY2I1MzgzNmY4YmI5NGRhYTkyMDkwOGU4MTY0ODY5ZQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ YTgyMzk0NzQ3MDdiOWNiM2JlMjM3ZmM4ZTIwYTc3YWI2NDliNWEzNzE2OGRh
10
+ MWIwNGM3ZGI0MWU5YzhhYmFiMDBkN2ZkYzE1ZDUxYzE4ODVjOGVmMzg1Zjdk
11
+ ZTQ5MDdhNTc1MWE3YzNiMDBhZjA0MWM1ZjRjMjdmYmFmM2YwNDg=
12
+ data.tar.gz: !binary |-
13
+ MzRiMGI0N2NmYTk5NDhkZWM5MDQxNDM3ODU4ZDM2Y2Q5YWQ5YmJiM2E2M2Vl
14
+ ZmQxMDQyYzZmZjk3ZDU3OTk3Yjc3Mzg5ZmMxZTVhOThkN2VhZWI4OWNhMTQz
15
+ M2U3NjNiNjRjMzgzMmEzZDNjZjYyMmZhMjM0NWM0N2Y3N2U2NWM=
data/CHANGELOG.mdown ADDED
@@ -0,0 +1,52 @@
1
+
2
+ # ChangeLog
3
+
4
+ ## 1.1.4
5
+
6
+ * Bumped version due to need to republish to RubyGems (last version did not include all files)
7
+
8
+ ## 1.1.3
9
+
10
+ * Improved cache hits
11
+
12
+ ## 1.1.2
13
+
14
+ * Sort results that explain more matches above those that explain fewer, even if the latter cover more characters
15
+
16
+ ## 1.1.1
17
+
18
+ * Percent coverage should include spaces between matches, otherwise a fully covered string would never reach 100%
19
+
20
+ ## 1.1.0
21
+
22
+ * Added result caching
23
+
24
+ ## 1.0.3
25
+
26
+ * Bugfix: Some cases would sort a low-level submatch over a higher one. Now matchlevel is set accordingly and used as a tie-breaker when coverage is equal
27
+
28
+ ## 1.0.2
29
+
30
+ * Bugfix: Coverage was not set for parent matchers in the case of a successful submatch. This meant that sometimes a first-level match would be ranked lower (treated as a submatch) when another submatch was found.
31
+
32
+ ## 1.0.1
33
+
34
+ * Bugfix: When using smart search / submatching - don't count whitespace as coverage. This caused /germany/ in first level to gain less priority than /( |^)germany( |$)/ in USA (second level).
35
+ * Bugfix: Fix a certain case where a Limit set to 1 would gain -1 (unlimited) interpretation
36
+
37
+ ## 1.0.0
38
+
39
+ This is a major version which breaks backwards compatibility.
40
+
41
+ * Changed API: HashRules#process now takes a single string and returns a new hash
42
+ * Added submatch feature to find results (deeply) nested even if a top level couldn't match.
43
+ * HashRules no longer cleans string before process (such as removing - and /), this is now the responsibility of the application.
44
+
45
+ ## 0.2.3
46
+
47
+ * Fix issue with using both and strings (it would only accept no-class and regex before)
48
+
49
+ ## 0.2
50
+
51
+ * Add matcher keywords both() for applying AND operator and no() for negating an expression
52
+ * Change name to sc-hashrules
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gem 'turn'
4
+ gem 'shoulda'
5
+ gem 'mocha'
6
+ gem 'htmlentities'
7
+ gem 'rake'
8
+ gem 'awesome_print'
data/Gemfile.lock ADDED
@@ -0,0 +1,37 @@
1
+ GEM
2
+ remote: https://rubygems.org/
3
+ specs:
4
+ activesupport (3.2.9)
5
+ i18n (~> 0.6)
6
+ multi_json (~> 1.0)
7
+ ansi (1.4.3)
8
+ awesome_print (1.0.1)
9
+ bourne (1.1.2)
10
+ mocha (= 0.10.5)
11
+ htmlentities (4.3.1)
12
+ i18n (0.6.1)
13
+ metaclass (0.0.1)
14
+ mocha (0.10.5)
15
+ metaclass (~> 0.0.1)
16
+ multi_json (1.5.0)
17
+ rake (10.0.3)
18
+ shoulda (3.3.2)
19
+ shoulda-context (~> 1.0.1)
20
+ shoulda-matchers (~> 1.4.1)
21
+ shoulda-context (1.0.1)
22
+ shoulda-matchers (1.4.2)
23
+ activesupport (>= 3.0.0)
24
+ bourne (~> 1.1.2)
25
+ turn (0.9.6)
26
+ ansi
27
+
28
+ PLATFORMS
29
+ ruby
30
+
31
+ DEPENDENCIES
32
+ awesome_print
33
+ htmlentities
34
+ mocha
35
+ rake
36
+ shoulda
37
+ turn
data/LICENSE.txt ADDED
@@ -0,0 +1,4 @@
1
+ HashRules by Mikael Wikman is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.
2
+
3
+ For more information, please visit:
4
+ http://creativecommons.org/licenses/by-sa/3.0/
data/README.mdown ADDED
@@ -0,0 +1,143 @@
1
+
2
+ # HashRules
3
+
4
+ A simple, yet powerful, gem for building constructions for identifying contents in a string in a structured, hierarchical, manner.
5
+
6
+ ## Example
7
+
8
+ Suppose you're building a system for identifying countries, regions and cities within a string. An excerpt might look like this:
9
+
10
+ ```ruby
11
+ match 'united states', w(/u ?s ?a/) do
12
+ set country: 'United States'
13
+
14
+ match /( |^)cali/, 'ca' do
15
+ set state: 'California'
16
+ set region: 'West Coast'
17
+
18
+ match 'long beach', 'longbeach' do
19
+ set city: 'Long Beach'
20
+ end
21
+
22
+ match 'fontana' do
23
+ set city: 'Fontana'
24
+ end
25
+ end
26
+ end
27
+ ```
28
+
29
+ This simple DSL translates to a series of OR and AND statements. For example, if we were to write the exact same logic but in plain if/then/else and regexes; the path for matching all the way to 'Long Beach' would be:
30
+
31
+ ```ruby
32
+ text = "Long Beach, California, United States"
33
+ text.gsub!(',','')
34
+ result = {}
35
+
36
+ if (text =~ /( |^)united states( |$)/ || text =~ /( |^)u ?s ?a( |$)/)
37
+ result['country'] = 'United States'
38
+
39
+ if (text =~ /( |^)cali/ || text =~ /( |^)ca( |$)/)
40
+ result['state'] = 'California'
41
+ result['region'] = 'West Coast'
42
+
43
+ if (text =~ /( |^)long beach( |$)/ || text =~ /( |^)longbeach( |$)/)
44
+ result['city'] = 'Long Beach'
45
+ elsif text =~ /( |^)fontana( |$)/
46
+ result['city'] = 'Fontana'
47
+ end
48
+ end
49
+ end
50
+ ```
51
+
52
+ ## Available matchers
53
+ <table>
54
+ <tr>
55
+ <th>Notation</th><th>Equivalence</th><th>Use case</th>
56
+ </tr>
57
+ <tr>
58
+ <td>
59
+ w(regex)
60
+ </td>
61
+ <td>
62
+ /( |^)#{regex}( |$)/
63
+ </td>
64
+ <td>
65
+ Create regex matching only whole words
66
+ </td>
67
+ </tr>
68
+
69
+ <tr>
70
+ <td>
71
+ 'regex'
72
+ </td>
73
+ <td>
74
+ /( |^)#{Regexp.escape(regex)}( |$)/
75
+ </td>
76
+ <td>
77
+ String matching whole words. 'key' would match "I have a key" but not "monkey"
78
+ </td>
79
+ </tr>
80
+
81
+ <tr>
82
+ <td>
83
+ no(matcher)
84
+ </td>
85
+ <td>
86
+ !(matcher.success?)
87
+ </td>
88
+ <td>
89
+ Inverted string or regex matcher
90
+ </td>
91
+ </tr>
92
+
93
+ <tr>
94
+ <td>
95
+ both(A,B)
96
+ </td>
97
+ <td>
98
+ A.success? && B.success?
99
+ </td>
100
+ <td>
101
+ Logical AND of given matchers, which may be nested
102
+ </td>
103
+ </tr>
104
+ </table>
105
+
106
+ ## Installation
107
+
108
+ `gem install hashrules`
109
+
110
+ ## Usage
111
+
112
+ ```
113
+ require 'hash_rules'
114
+
115
+ rules = HashRules.new(args)
116
+
117
+ # args:
118
+ # * folder: path to the folder containing one or more rule files (all will be read)
119
+
120
+ process_args={}
121
+ results = rules.process('Alabama, Canada', process_args)
122
+
123
+ results.each do |result|
124
+ p result
125
+ end
126
+ ```
127
+
128
+ Each result contains the following keys:
129
+
130
+ * data: This is the resulting data accumulated from the rules `set` statements
131
+ * coverage: An array of index pairs showing what part of the string was matched.
132
+ * match_id: Indicates the deepest matching rule id
133
+ * matchlevel: Indicates how deeply nested the first match was. If `max_submatch_level` is zero, this will always be 1.
134
+ * percent_coverage: A number from 0 to 100 indicating what percentage of the string was matched
135
+
136
+ Process_args may be any of the following:
137
+
138
+ * max_submatch_level: Allow search to start from a nested matcher. If this value is 1, this means HashRules may skip at the very most one level. Recommended to be used with limit: -1
139
+ * limit: Stop search when this number of matches have been found. Good for performance, but not recommended if `max_submatch_level` is more than zero because results won't guarantee best match.
140
+
141
+ ## License
142
+
143
+ <a rel="license" href="http://creativecommons.org/licenses/by-sa/3.0/deed.en_US"><img alt="Creative Commons License" style="border-width:0" src="http://i.creativecommons.org/l/by-sa/3.0/80x15.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" property="dct:title">HashRules</span> by <span xmlns:cc="http://creativecommons.org/ns#" property="cc:attributionName">Mikael Wikman</span> is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/3.0/deed.en_US">Creative Commons Attribution-ShareAlike 3.0 Unported License</a>.
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
require 'rake/testtask'

# Registers the test task: puts the `test` directory on the load path and
# runs every file matching test/**/*_test.rb, printing the command verbosely.
Rake::TestTask.new do |test_task|
  test_task.libs.push('test')
  test_task.test_files = FileList['test/**/*_test.rb']
  test_task.verbose = true
end

# Running plain `rake` runs the test suite.
task default: :test
data/app.gemspec ADDED
@@ -0,0 +1,16 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
# Gem specification for hashrules.
#
# The file list is taken from git, so the gem must be built from inside a
# git checkout; executables and test files are derived from that list.
Gem::Specification.new do |gem|
  gem.name          = "hashrules"
  gem.version       = '1.1.4'
  gem.authors       = ['Mikael Wikman']
  gem.email         = ['mikael@swedcontent.com']
  # A non-blank summary is required for a well-formed gem; keep it distinct
  # from the description.
  gem.summary       = %q{Hierarchical rule DSL for identifying contents in a string}
  gem.description   = %q{Rule-based hash manipulator using custom DSL}
  gem.homepage      = "https://github.com/mikaelwikman/hashrules"
  # Matches LICENSE.txt (Creative Commons Attribution-ShareAlike 3.0 Unported).
  gem.license       = 'CC-BY-SA-3.0'

  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|features)/})
  gem.require_paths = ["lib"]
end
@@ -0,0 +1,167 @@
1
# A node in a tree of matching rules, built with a small DSL
# (+match+, +set+, +w+, +no+, +both+, +include_subfolder+).
#
# Each node holds:
# * +sets+  - string-keyed data contributed when this node is reached
# * +rules+ - an array of [matchers, HashMatcher] pairs; the child matcher is
#   analyzed when any of the matchers succeeds (logical OR)
#
# Rule files are plain Ruby evaluated in the context of the root node, so
# only load folders you trust.
class HashMatcher
  attr_reader :rules, :sets

  def initialize
    @rules = []
    @sets = {}
    # The node that +match+/+set+ currently write to while the DSL runs.
    @context = self
  end

  # Evaluates every *.rb file directly inside +folder+ as rule DSL.
  #
  # Files are loaded in sorted order so the resulting rule order does not
  # depend on the file system.
  #
  # NOTE(security): file contents are passed to +eval+; never point this at
  # a folder containing untrusted code.
  def include_folder(folder)
    Dir["#{folder}/*.rb"].sort.each do |file|
      # Reset per file: a rule file may have changed it via include_subfolder.
      @current_folder = folder
      contents = File.read(file)
      # Passing the file name gives meaningful backtraces for broken rules.
      eval(contents, binding, file)
    end
  end

  # Loads rule files from +folder+, resolved relative to the folder of the
  # rule file currently being evaluated.
  def include_subfolder(folder)
    include_folder "#{@current_folder}/#{folder}"
  end

  # Renders the rule tree as indented text; +i+ is the indentation depth.
  def to_s(i = 0)
    indent = ' ' * i
    result = ''.dup
    sets.each do |k, v|
      result << "#{indent}#{k} = #{v}\n"
    end
    rules.each do |regexes, matcher|
      result << "\n#{indent}If match #{regexes.to_s}\n"
      result << matcher.to_s(i + 1)
      result << "#{indent}End\n"
    end
    result
  end

  # Walks the rule tree against +string+ and collects matches.
  #
  # +opts+ is shared (and mutated) across the whole recursion:
  # * :limit       - decremented each time a rule is entered at a level where
  #   no more levels may be skipped; iteration stops when it reaches zero
  #   (a negative value therefore never stops). Defaults to 1.
  # * :skip_levels - number of levels that may be entered without matching.
  #
  # +level+ is the depth of this node (1 for the root).
  #
  # Returns an array of hashes with the keys 'data' (merged +sets+ along the
  # path), 'coverage' (array of [start, stop] inclusive index pairs),
  # 'matchlevel' (shallowest level on the path whose rule matched) and
  # :match_id (object id of the deepest node reached).
  def analyze(string, opts = {}, level = 1)
    matches = []
    opts[:limit] ||= 1
    skip_levels = opts[:skip_levels] || 0

    rules.each do |regexes, matcher|
      offsets = []
      # After the find, offsets holds the first successful matcher's offsets,
      # or nil when none of the matchers succeeded.
      matched = regexes.find { |r| offsets = test(string, r) }
      next unless matched || skip_levels > 0

      opts[:skip_levels] = skip_levels - 1
      sub_matches = matcher.analyze(string, opts, level + 1)

      sub_matches.each do |m|
        m['data'] = sets.merge(m['data'])
        next unless offsets

        # Assigned on the way back up, so the outermost matching level wins.
        m['matchlevel'] = level
        offsets.each do |start, stop|
          # [-1, -1] marks a successful no(...) matcher; it covers nothing.
          next if start < 0

          stop -= 1 # MatchData offsets are end-exclusive

          # Whole-word patterns include the surrounding spaces; trim them.
          start += 1 if string[start] == ' '
          stop -= 1 if string[stop] == ' '

          m['coverage'] << [start, stop]
        end
      end

      matches += sub_matches

      opts[:limit] -= 1 if skip_levels <= 0
      break if opts[:limit] == 0
    end

    # A node reached beyond the skipped levels with no deeper match is itself
    # the result.
    if matches.empty? && skip_levels < 0
      matches << { 'data' => sets.dup, 'coverage' => [], match_id: self.object_id }
    end

    matches
  end

  private

  # Tests +matcher+ against +string+.
  #
  # Returns nil on failure, otherwise an array of [start, stop] offsets
  # (end-exclusive). A successful NoClass yields the placeholder [[-1, -1]];
  # an AndClass yields the offsets of all its sub-matchers (including nested
  # AndClass ones) with placeholders removed, which may be an empty array.
  def test(string, matcher)
    if matcher.is_a?(NoClass)
      m = test(string, matcher.regex)
      [[-1, -1]] unless m
    elsif matcher.is_a?(AndClass)
      collected = []
      matcher.regexes.each do |sub|
        found = test(string, sub)
        return nil unless found

        collected.concat(found)
      end
      collected.reject { |start, _stop| start == -1 }
    else
      m = matcher.match(string)
      [m.offset(0)] if m
    end
  end

  # DSL: merges +sub_hash+ (keys converted to strings) into the data of the
  # node currently being defined.
  def set(sub_hash)
    @context.sets.merge! stringified(sub_hash)
  end

  # DSL: adds a rule to the current node. +args+ are strings, regexes or
  # no()/both() matchers, combined with OR; the block defines the child node.
  def match(*args, &block)
    regexes = args.map { |r| to_regex(r) }
    matcher = HashMatcher.new
    old_context = @context
    old_folder = @current_folder
    @context.rules << [regexes, matcher]
    @context = matcher
    begin
      block.call
    ensure
      # Restore even if the rule block raises, so later rules attach to the
      # right node.
      @context = old_context
      @current_folder = old_folder
    end
  end

  # DSL: wraps +regex+ so that it only matches whole (space-delimited) words.
  # The regex's i/x/m flags are preserved.
  def w(regex)
    flags = regex.options & (Regexp::IGNORECASE | Regexp::EXTENDED | Regexp::MULTILINE)
    Regexp.new("(^| )#{regex.source}($| )", flags)
  end

  # DSL: negates a matcher.
  def no(regex)
    NoClass.new(to_regex(regex))
  end

  # DSL: logical AND of the given matchers.
  def both(*regexes)
    AndClass.new(to_regex(regexes))
  end

  # Converts a string into a whole-word regex (escaped), an array into an
  # array of converted matchers, and leaves anything else untouched.
  def to_regex(matcher)
    if matcher.kind_of?(String)
      /(^| )#{Regexp.escape(matcher)}($| )/
    elsif matcher.kind_of?(Array)
      matcher.map { |r| to_regex(r) }
    else
      matcher
    end
  end

  # Returns a copy of +hash+ with every key converted to a string.
  def stringified(hash)
    hash.each_with_object({}) do |(key, val), out|
      out[key.to_s] = val
    end
  end

  # Matcher that succeeds when the wrapped matcher fails.
  class NoClass
    attr_reader :regex

    def initialize(regex)
      @regex = regex
    end

    def to_s
      "!(#{@regex})"
    end
  end

  # Matcher that succeeds when all wrapped matchers succeed.
  class AndClass
    attr_reader :regexes

    def initialize(regexes)
      @regexes = regexes
    end

    def to_s
      @regexes.map { |r| r.inspect }.join(' AND ')
    end
  end
end
data/lib/hash_rules.rb ADDED
@@ -0,0 +1,124 @@
1
+ require 'hash_matcher'
2
+ require 'json'
3
+
4
# Entry point of the gem: loads a folder of rule files into a HashMatcher
# and analyzes strings against it, caching the results.
class HashRules
  # args:
  # * :folder - path to the folder containing the rule files (required)
  def initialize(args)
    @folder = args[:folder] || raise("No folder specified!")

    @hashmatcher = HashMatcher.new
    @hashmatcher.include_folder(@folder)
    @cache = {}
  end

  # Analyzes +string+ and returns an array of result hashes, best match first.
  #
  # opts:
  # * :max_submatch_level - how many levels may be skipped (default 0)
  # * :limit              - stop after this many matches; <= 0 means no limit
  #   (default 1)
  #
  # Results are cached per cleaned string AND effective options, since the
  # options change the outcome. A deep copy is returned on every call so
  # callers cannot corrupt the cache.
  def process(string, opts = {})
    string = clean_string(string)
    cache_key = [string, opts[:max_submatch_level] || 0, opts[:limit] || 1]

    if (cached = @cache[cache_key])
      return Marshal.load(cached)
    end

    result = Processor.new(string, @hashmatcher, opts).do

    @cache[cache_key] = Marshal.dump(result)
    result
  end

  # Collapses every run of whitespace into a single space.
  def clean_string(string)
    string.gsub(/\s+/, ' ')
  end

  def to_s
    "== HASHRULES ==#{@hashmatcher}"
  end

  # Runs one analysis: queries the matcher once per submatch level,
  # computes coverage percentages and sorts the collected results.
  class Processor
    def initialize(string, hashmatcher, opts)
      @string = string.dup
      @hashmatcher = hashmatcher
      @max_submatch_level = opts[:max_submatch_level] || 0
      @limit = opts[:limit] || 1
      @memory = []
    end

    # Returns the sorted list of results.
    def do
      each_submatch_level do |submatch_level|
        add_to_list(analyze(submatch_level))
        break if reached_limit?
      end

      sort_by_coverage

      list
    end

    private

    # Asks the matcher for matches that may skip +submatch_level+ levels,
    # drops matches without data and adds 'percent_coverage' to the rest.
    def analyze(submatch_level)
      # Only ask for as many results as are still missing; -1 means unlimited.
      limit = @limit <= 0 ? -1 : (results_count - @limit).abs
      opts = {
        skip_levels: submatch_level,
        limit: limit
      }

      matches = @hashmatcher.analyze(@string, opts)
      matches = matches.delete_if { |m| m['data'].empty? }

      matches.each do |m|
        m['percent_coverage'] = percent_covered(m['coverage'])
      end

      matches
    end

    # Integer percentage (0..100) of the string covered by the given
    # [start, stop] inclusive index pairs. A single uncovered character
    # between two covered ones (e.g. the space between two matches) counts
    # as covered. An empty string is reported as 0% instead of dividing by
    # zero.
    def percent_covered(pairs)
      length = @string.length
      return 0 if length.zero?

      coverage = Array.new(length, false)
      pairs.each do |start, stop|
        (start..stop).each do |i|
          coverage[i] = true
        end
      end
      (1..(coverage.count - 2)).each do |i|
        coverage[i] = true if coverage[i - 1] && coverage[i + 1]
      end

      coverage.count { |c| c } * 100 / length
    end

    def each_submatch_level(&block)
      (0..@max_submatch_level).each(&block)
    end

    def reached_limit?
      @limit > 0 && @memory.count >= @limit
    end

    def results_count
      @memory.count
    end

    def add_to_list(results)
      @memory += results
      # results.each do |result|
      #   @memory << result unless @memory.any?{|m| m[:match_id] == result[:match_id]}
      # end
    end

    # Orders results by: most coverage entries first, then highest
    # percent coverage, then shallowest matchlevel.
    def sort_by_coverage
      @memory.sort! do |a, b|
        (b['coverage'].count <=> a['coverage'].count).nonzero? ||
          (b['percent_coverage'] <=> a['percent_coverage']).nonzero? ||
          (a['matchlevel'] <=> b['matchlevel'])
      end
    end

    def list
      @memory
    end
  end
end
@@ -0,0 +1,3 @@
1
+ match 'oregon' do
2
+ set region: 'Oregon'
3
+ end
@@ -0,0 +1,4 @@
1
+
2
+ match /ми 8т/ do
3
+ set manufacturer: 'Ми-8Т'
4
+ end
@@ -0,0 +1,8 @@
1
+
2
+ match "match" do
3
+ set match: 'first'
4
+ end
5
+
6
+ match "match" do
7
+ set match: 'second'
8
+ end
@@ -0,0 +1,14 @@
1
+
2
+ match both(/per/, no(/piper/)) do
3
+ set manufacturer: 'Per'
4
+ end
5
+
6
+ match both('string', /regex/) do
7
+ set manufacturer: 'success'
8
+ end
9
+
10
+ match /piper/ do
11
+ set manufacturer: 'Piper'
12
+
13
+ include_subfolder 'piper'
14
+ end
@@ -0,0 +1,19 @@
1
+
2
+ match w(/pa ?28/) do # => /( |^)pa ?28( |$)/
3
+ set family: 'PA-28 Cherokee'
4
+ set engine_count: 1
5
+ set category: 'piston'
6
+
7
+ match 'pa28 181', 'pa 28 181' do
8
+ set model: 'PA-28-181'
9
+ set horsepower: 180
10
+
11
+ match /archer ii/, /archer 2/, /ii/ do
12
+ set model: 'PA-28-181 Archer II'
13
+ end
14
+
15
+ match /archer iii/, /archer 3/, /iii/ do
16
+ set model: 'PA-28-181 Archer III'
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,4 @@
1
+ match 'ohio' do
2
+ set region: 'Ohio'
3
+ include_subfolder('ohio')
4
+ end
@@ -0,0 +1,7 @@
1
+ match 'oregon', 'or' do
2
+ set city: 'Oregon'
3
+
4
+ match 'pearson' do
5
+ set place: 'pearson park'
6
+ end
7
+ end
@@ -0,0 +1,10 @@
1
+
2
+ match 'canada' do
3
+ set country: 'Canada'
4
+ include_subfolder('canada')
5
+ end
6
+
7
+ match w(/u ?s ?a?/), 'united states' do
8
+ set country: 'United States'
9
+ include_subfolder('united states')
10
+ end
@@ -0,0 +1,178 @@
1
+ # encoding: utf-8
2
+
3
+ require 'test_helper'
4
+ require 'hash_rules'
5
+
6
# Integration tests for HashRules, run against the rule files in
# test/examples.
class HashRulesTest < TestCase
  context 'hashrules, normal operation' do

    setup do
      @it = HashRules.new(folder: 'test/examples')
    end

    should 'return empty array when no match' do
      assert_equal [], @it.process('curry')
    end

    should 'identify manufacturer' do
      data = @it.process("1986 piper tjobahobo").first['data']
      assert_equal 'Piper', data['manufacturer']
    end

    should 'identify perfect match' do
      data = @it.process("piper pa28 181 ii").first['data']
      assert_equal 'PA-28-181 Archer II', data['model']
    end

    should 'identify model' do
      data = @it.process("2001 piper pa28 181 ii").first['data']
      assert_equal 'PA-28-181 Archer II', data['model']
    end

    should 'allow numbers or letter next to match' do
      data = @it.process("apiperloon").first['data']
      assert_equal "Piper", data['manufacturer']
    end

    should 'match on string' do
      data = @it.process("piper pa 28 181").first['data']
      assert_equal "PA-28-181", data['model']
    end

    should 'discard double whitespace' do
      data = @it.process("piper \t\r \npa 28\n 181").first['data']
      assert_equal "PA-28-181", data['model']
    end

    should 'use "word" to make regexes match whole words' do
      data = @it.process("piper apa-280").first['data']
      assert_nil data['family']
    end

    should 'present us with matched slices and percentage covered' do
      data = @it.process("i'd like a piper in the pa28 family")
      first = data.first

      assert_equal 'Piper', first['data']['manufacturer']
      assert_equal [[24, 27], [11, 15]], first['coverage']
      assert_equal 25, first['percent_coverage']
    end

    should 'include spaces between matches in percentage' do
      data = @it.process("oregon canada")
      first = data.first

      assert_equal 100, first['percent_coverage']
    end

    context 'both and no' do
      should 'make operation trees' do
        result = @it.process('person')
        assert_equal 'Per', result.first['data']['manufacturer']
      end

      should 'use both with strings and regexes' do
        data = @it.process('string rregexx').first['data']
        assert_equal 'success', data['manufacturer']
      end
    end

    should 'not allow numbers or letter next to STRING match' do
      data = @it.process("piper apa-28 18100").first['data']
      assert_nil data['model']
    end

    should 'not allow several matches in the same context' do
      # note: the rules are written in regex /ii/ which means it will also match /iii/
      data = @it.process('piper pa28 181 archer iii', limit: 1).first['data']
      assert_equal 'PA-28-181 Archer II', data['model']
    end

    context "case insensitive" do
      should 'match cyrillic letters' do
        data = @it.process("ми 8т").first['data']
        assert_equal "Ми-8Т", data['manufacturer']
      end
    end
  end

  context 'multimatch' do

    setup do
      @it = HashRules.new(folder: 'test/examples')
    end

    should 'match several if indicated' do
      r = @it.process('this should return two match', limit: -1)
      assert_equal 2, r.count
      first, second = r

      assert_equal 'first', first['data']['match']
      assert_equal 'second', second['data']['match']
    end
  end

  context 'hashrules, submatch' do

    setup do
      @it = HashRules.new(folder: 'test/examples')
    end

    should 'match with adequate information, just as without submatch' do
      r = @it.process('oregon ohio united states').first['data']
      assert_equal 'United States', r['country']
      assert_equal 'Ohio', r['region']
      assert_equal 'Oregon', r['city']

      r = @it.process('canada oregon').first['data']
      assert_equal 'Canada', r['country']
      assert_equal 'Oregon', r['region']
      assert_nil r['city']
    end

    should 'prefer to match on 2:nd level over 3:rd level' do
      r = @it.process('oregon', max_submatch_level: 1, limit: 1).first['data']

      # united states have a city called 'oregon', but which is on 3rd level,
      # Oregon in canada is a state, and on the 2nd level
      assert_equal 'Canada', r['country']
    end

    should 'prefer several multimatch over a single match and many characters' do

      # there is no such place as 'or us' in canada, however there is or in united states
      r = @it.process('canada or us', max_submatch_level: 6, limit: -1)

      assert_equal 3, r.count
      assert_equal 'United States', r[0]['data']['country']
    end

    should 'strive for 100% match if possible' do
      r = @it.process('oregon ohio', max_submatch_level: 5, limit: -1)

      assert_equal 3, r.count

      # Without intelligence, Oregon (canada) would be chosen because it's a
      # state on the 2nd level, and Oregon in US is a city on 3rd level.
      # However, we reason that if a 3rd level match can explain more about a
      # string then it is reasonably more likely to be accurate.

      assert_equal 'Ohio', r[0]['data']['region']
    end

    should 'backtrack coverage when found submatch' do
      r = @it.process('pearson ohio', max_submatch_level: 6, limit: -1)

      assert_equal [[0,6],[8,11]], r[0]['coverage']
    end

    should 'include matchlevel' do
      r = @it.process('pearson ohio', max_submatch_level: 6, limit: -1)
      assert_equal 2, r[0]['matchlevel']

      r = @it.process('oregon', max_submatch_level: 6, limit: -1)
      assert_equal 'Canada', r[0]['data']['country']
      assert_equal 2, r[0]['matchlevel']
      assert_equal 'United States', r[1]['data']['country']
      assert_equal 3, r[1]['matchlevel']
    end
  end
end
@@ -0,0 +1,12 @@
1
+ require 'bundler/setup'
2
+ require 'test/unit'
3
+ require 'turn/autorun'
4
+ require 'shoulda'
5
+ require 'mocha'
6
+
7
+ #Turn.config.format = :dot
8
+
9
+ $LOAD_PATH << 'lib'
10
+
11
+ class TestCase < Test::Unit::TestCase
12
+ end
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hashrules
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.1.4
5
+ platform: ruby
6
+ authors:
7
+ - Mikael Wikman
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-07-04 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Rule-based hash manipulator using custom DSL
14
+ email:
15
+ - mikael@swedcontent.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - CHANGELOG.mdown
21
+ - Gemfile
22
+ - Gemfile.lock
23
+ - LICENSE.txt
24
+ - README.mdown
25
+ - Rakefile
26
+ - app.gemspec
27
+ - lib/hash_matcher.rb
28
+ - lib/hash_rules.rb
29
+ - test/examples/canada/oregon.rb
30
+ - test/examples/cyrillic.rb
31
+ - test/examples/match_two.rb
32
+ - test/examples/piper.rb
33
+ - test/examples/piper/pa28.rb
34
+ - test/examples/united states/ohio.rb
35
+ - test/examples/united states/ohio/oregon.rb
36
+ - test/examples/world.rb
37
+ - test/hash_rules_test.rb
38
+ - test/test_helper.rb
39
+ homepage: https://github.com/mikaelwikman/hashrules
40
+ licenses: []
41
+ metadata: {}
42
+ post_install_message:
43
+ rdoc_options: []
44
+ require_paths:
45
+ - lib
46
+ required_ruby_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ! '>='
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ required_rubygems_version: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ requirements: []
57
+ rubyforge_project:
58
+ rubygems_version: 2.0.3
59
+ signing_key:
60
+ specification_version: 4
61
+ summary: ''
62
+ test_files:
63
+ - test/examples/canada/oregon.rb
64
+ - test/examples/cyrillic.rb
65
+ - test/examples/match_two.rb
66
+ - test/examples/piper.rb
67
+ - test/examples/piper/pa28.rb
68
+ - test/examples/united states/ohio.rb
69
+ - test/examples/united states/ohio/oregon.rb
70
+ - test/examples/world.rb
71
+ - test/hash_rules_test.rb
72
+ - test/test_helper.rb