hashrules 1.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/CHANGELOG.mdown +52 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +37 -0
- data/LICENSE.txt +4 -0
- data/README.mdown +143 -0
- data/Rakefile +9 -0
- data/app.gemspec +16 -0
- data/lib/hash_matcher.rb +167 -0
- data/lib/hash_rules.rb +124 -0
- data/test/examples/canada/oregon.rb +3 -0
- data/test/examples/cyrillic.rb +4 -0
- data/test/examples/match_two.rb +8 -0
- data/test/examples/piper.rb +14 -0
- data/test/examples/piper/pa28.rb +19 -0
- data/test/examples/united states/ohio.rb +4 -0
- data/test/examples/united states/ohio/oregon.rb +7 -0
- data/test/examples/world.rb +10 -0
- data/test/hash_rules_test.rb +178 -0
- data/test/test_helper.rb +12 -0
- metadata +72 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MDJhZmM2MWMyMTkzNjc3NjkwNjYzZmRlMjkwMjllNjFkZmJlNmRlYw==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
YjZiYTk1NDBkY2I1MzgzNmY4YmI5NGRhYTkyMDkwOGU4MTY0ODY5ZQ==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
YTgyMzk0NzQ3MDdiOWNiM2JlMjM3ZmM4ZTIwYTc3YWI2NDliNWEzNzE2OGRh
|
10
|
+
MWIwNGM3ZGI0MWU5YzhhYmFiMDBkN2ZkYzE1ZDUxYzE4ODVjOGVmMzg1Zjdk
|
11
|
+
ZTQ5MDdhNTc1MWE3YzNiMDBhZjA0MWM1ZjRjMjdmYmFmM2YwNDg=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
MzRiMGI0N2NmYTk5NDhkZWM5MDQxNDM3ODU4ZDM2Y2Q5YWQ5YmJiM2E2M2Vl
|
14
|
+
ZmQxMDQyYzZmZjk3ZDU3OTk3Yjc3Mzg5ZmMxZTVhOThkN2VhZWI4OWNhMTQz
|
15
|
+
M2U3NjNiNjRjMzgzMmEzZDNjZjYyMmZhMjM0NWM0N2Y3N2U2NWM=
|
data/CHANGELOG.mdown
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
|
2
|
+
# ChangeLog
|
3
|
+
|
4
|
+
## 1.1.4
|
5
|
+
|
6
|
+
* Bumped version due to need to republish to RubyGems (last version did not include all files)
|
7
|
+
|
8
|
+
## 1.1.3
|
9
|
+
|
10
|
+
* Improved cache hits
|
11
|
+
|
12
|
+
## 1.1.2
|
13
|
+
|
14
|
+
* Sort results that can explain more matches above those that explain fewer, even if the latter cover more characters
|
15
|
+
|
16
|
+
## 1.1.1
|
17
|
+
|
18
|
+
* Percent coverage should include spaces between matches, otherwise a fully covered string would never reach 100%
|
19
|
+
|
20
|
+
## 1.1.0
|
21
|
+
|
22
|
+
* Added result caching
|
23
|
+
|
24
|
+
## 1.0.3
|
25
|
+
|
26
|
+
* Bugfix: Some cases would sort a low-level submatch over a higher one. Now matchlevel is set accordingly and used as a tie-breaker when coverage is equal
|
27
|
+
|
28
|
+
## 1.0.2
|
29
|
+
|
30
|
+
* Bugfix: Coverage was not set for parent matchers in the case of a successful submatch. This meant that sometimes a first-level match would be ranked lower (treated as a submatch) when another submatch was found.
|
31
|
+
|
32
|
+
## 1.0.1
|
33
|
+
|
34
|
+
* Bugfix: When using smart search / submatching - don't count whitespace as coverage. This caused /germany/ in first level to gain less priority than /( |^)germany( |$)/ in USA (second level).
|
35
|
+
* Bugfix: Fix a certain case where a Limit set to 1 would gain -1 (unlimited) interpretation
|
36
|
+
|
37
|
+
## 1.0.0
|
38
|
+
|
39
|
+
This is a major version which breaks backwards compatibility.
|
40
|
+
|
41
|
+
* Changed API: HashRules#process now takes a single string and returns a new hash
|
42
|
+
* Added submatch feature to find results (deeply) nested even if a top level couldn't match.
|
43
|
+
* HashRules no longer cleans string before process (such as removing - and /), this is now the responsibility of the application.
|
44
|
+
|
45
|
+
## 0.2.3
|
46
|
+
|
47
|
+
* Fix issue with using both() with strings (it previously accepted only no() matchers and regexes)
|
48
|
+
|
49
|
+
## 0.2
|
50
|
+
|
51
|
+
* Add matcher keywords both() for applying AND operator and no() for negating an expression
|
52
|
+
* Change name to sc-hashrules
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
GEM
|
2
|
+
remote: https://rubygems.org/
|
3
|
+
specs:
|
4
|
+
activesupport (3.2.9)
|
5
|
+
i18n (~> 0.6)
|
6
|
+
multi_json (~> 1.0)
|
7
|
+
ansi (1.4.3)
|
8
|
+
awesome_print (1.0.1)
|
9
|
+
bourne (1.1.2)
|
10
|
+
mocha (= 0.10.5)
|
11
|
+
htmlentities (4.3.1)
|
12
|
+
i18n (0.6.1)
|
13
|
+
metaclass (0.0.1)
|
14
|
+
mocha (0.10.5)
|
15
|
+
metaclass (~> 0.0.1)
|
16
|
+
multi_json (1.5.0)
|
17
|
+
rake (10.0.3)
|
18
|
+
shoulda (3.3.2)
|
19
|
+
shoulda-context (~> 1.0.1)
|
20
|
+
shoulda-matchers (~> 1.4.1)
|
21
|
+
shoulda-context (1.0.1)
|
22
|
+
shoulda-matchers (1.4.2)
|
23
|
+
activesupport (>= 3.0.0)
|
24
|
+
bourne (~> 1.1.2)
|
25
|
+
turn (0.9.6)
|
26
|
+
ansi
|
27
|
+
|
28
|
+
PLATFORMS
|
29
|
+
ruby
|
30
|
+
|
31
|
+
DEPENDENCIES
|
32
|
+
awesome_print
|
33
|
+
htmlentities
|
34
|
+
mocha
|
35
|
+
rake
|
36
|
+
shoulda
|
37
|
+
turn
|
data/LICENSE.txt
ADDED
data/README.mdown
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
|
2
|
+
# HashRules
|
3
|
+
|
4
|
+
A simple, yet powerful, gem for building constructions for identifying contents in a string in a structured, hierarchical, manner.
|
5
|
+
|
6
|
+
## Example
|
7
|
+
|
8
|
+
Suppose you're building a system for identifying countries, regions and cities within a string. An excerpt might look like this:
|
9
|
+
|
10
|
+
```ruby
|
11
|
+
match 'united states', w(/u ?s ?a/) do
|
12
|
+
set country: 'United States'
|
13
|
+
|
14
|
+
match /( |^)cali/, 'ca' do
|
15
|
+
set state: 'California'
|
16
|
+
set region: 'West Coast'
|
17
|
+
|
18
|
+
match 'long beach', 'longbeach' do
|
19
|
+
set city: 'Long Beach'
|
20
|
+
end
|
21
|
+
|
22
|
+
match 'fontana' do
|
23
|
+
set city: 'Fontana'
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
```
|
28
|
+
|
29
|
+
This simple DSL translates to a series of OR and AND statements. For example, if we were to write the exact same logic in plain if/then/else and regexes, the path for matching all the way to 'Long Beach' would be:
|
30
|
+
|
31
|
+
```ruby
|
32
|
+
text = "Long Beach, California, United States"
|
33
|
+
text.gsub!(',','')
|
34
|
+
result = {}
|
35
|
+
|
36
|
+
if (text =~ /( |^)united states( |$)/ || text =~ /( |^)u ?s ?a( |$)/)
|
37
|
+
result['country'] = 'United States'
|
38
|
+
|
39
|
+
if (text =~ /( |^)cali/ || text =~ /( |^)ca( |$)/)
|
40
|
+
result['state'] = 'California'
|
41
|
+
result['region'] = 'West Coast'
|
42
|
+
|
43
|
+
if (text =~ /( |^)long beach( |$)/ || text =~ /( |^)longbeach( |$)/)
|
44
|
+
result['city'] = 'Long Beach'
|
45
|
+
elsif text =~ /( |^)fontana( |$)/
|
46
|
+
result['city'] = 'Fontana'
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
```
|
51
|
+
|
52
|
+
## Available matchers
|
53
|
+
<table>
|
54
|
+
<tr>
|
55
|
+
<th>Notation</th><th>Equivalence</th><th>Use case</th>
|
56
|
+
</tr>
|
57
|
+
<tr>
|
58
|
+
<td>
|
59
|
+
w(regex)
|
60
|
+
</td>
|
61
|
+
<td>
|
62
|
+
/( |^)#{regex}( |$)/
|
63
|
+
</td>
|
64
|
+
<td>
|
65
|
+
Create regex matching only whole words
|
66
|
+
</td>
|
67
|
+
</tr>
|
68
|
+
|
69
|
+
<tr>
|
70
|
+
<td>
|
71
|
+
'regex'
|
72
|
+
</td>
|
73
|
+
<td>
|
74
|
+
/( |^)#{Regexp.escape(regex)}( |$)/
|
75
|
+
</td>
|
76
|
+
<td>
|
77
|
+
String matching whole words. 'key' would match "I have a key" but not "monkey"
|
78
|
+
</td>
|
79
|
+
</tr>
|
80
|
+
|
81
|
+
<tr>
|
82
|
+
<td>
|
83
|
+
no(matcher)
|
84
|
+
</td>
|
85
|
+
<td>
|
86
|
+
!(matcher.success?)
|
87
|
+
</td>
|
88
|
+
<td>
|
89
|
+
Inverted string or regex matcher
|
90
|
+
</td>
|
91
|
+
</tr>
|
92
|
+
|
93
|
+
<tr>
|
94
|
+
<td>
|
95
|
+
both(A,B)
|
96
|
+
</td>
|
97
|
+
<td>
|
98
|
+
A.success? && B.success?
|
99
|
+
</td>
|
100
|
+
<td>
|
101
|
+
Logical AND of given matchers, which may be nested
|
102
|
+
</td>
|
103
|
+
</tr>
|
104
|
+
</table>
|
105
|
+
|
106
|
+
## Installation
|
107
|
+
|
108
|
+
`gem install hashrules`
|
109
|
+
|
110
|
+
## Usage
|
111
|
+
|
112
|
+
```ruby
|
113
|
+
require 'hash_rules'
|
114
|
+
|
115
|
+
rules = HashRules.new(args)
|
116
|
+
|
117
|
+
# args:
|
118
|
+
# * folder: path to the folder containing one or more rule files (all will be read)
|
119
|
+
|
120
|
+
process_args={}
|
121
|
+
results = rules.process('Alabama, Canada', process_args)
|
122
|
+
|
123
|
+
results.each do |result|
|
124
|
+
p result
|
125
|
+
end
|
126
|
+
```
|
127
|
+
|
128
|
+
Each result contains the following keys:
|
129
|
+
|
130
|
+
* data: This is the resulting data accumulated from the rules `set` statements
|
131
|
+
* coverage: An array of index pairs showing what part of the string was matched.
|
132
|
+
* match_id: Indicates the deepest matching rule id
|
133
|
+
* matchlevel: Indicates how deeply nested the first match was. If `max_submatch_level` is zero, this will always be 1.
|
134
|
+
* percent_coverage: A number from 0 to 100 indicating what percentage of the string was matched
|
135
|
+
|
136
|
+
`process_args` may contain any of the following keys:
|
137
|
+
|
138
|
+
* max_submatch_level: Allow the search to start from a nested matcher. If this value is 1, HashRules may skip at most one level. Defaults to 0 (no levels are skipped). Recommended to be used with `limit: -1`.
|
139
|
+
* limit: Stop the search when this number of matches has been found. Defaults to 1; zero or a negative value such as -1 means no limit. Good for performance, but not recommended if `max_submatch_level` is more than zero, because the results are then not guaranteed to include the best match.
|
140
|
+
|
141
|
+
## License
|
142
|
+
|
143
|
+
<a rel="license" href="http://creativecommons.org/licenses/by-sa/3.0/deed.en_US"><img alt="Creative Commons License" style="border-width:0" src="http://i.creativecommons.org/l/by-sa/3.0/80x15.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" property="dct:title">HashRules</span> by <span xmlns:cc="http://creativecommons.org/ns#" property="cc:attributionName">Mikael Wikman</span> is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/3.0/deed.en_US">Creative Commons Attribution-ShareAlike 3.0 Unported License</a>.
|
data/Rakefile
ADDED
data/app.gemspec
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# Gem specification for hashrules. The file list is taken from git, so the
# gem must be built from inside a git checkout.
Gem::Specification.new do |gem|
  gem.authors       = ['Mikael Wikman']
  gem.email         = ['mikael@swedcontent.com']
  gem.description   = %q{Rule-based hash manipulator using custom DSL}
  # A blank summary leaves the gem undescribed in listings and search results.
  gem.summary       = %q{Hierarchical rule DSL for identifying contents in a string}
  gem.homepage      = "https://github.com/mikaelwikman/hashrules"

  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|features)/})
  gem.name          = "hashrules"
  gem.require_paths = ["lib"]
  gem.version       = '1.1.4'
end
|
data/lib/hash_matcher.rb
ADDED
@@ -0,0 +1,167 @@
|
|
1
|
+
# A node in the rule tree built by the rule DSL (+match+, +set+, +w+, +no+,
# +both+).
#
# Each node holds the key/value pairs to apply when the node is reached
# (+sets+) and an ordered list of child rules (+rules+). A rule is a pair
# <tt>[matchers, child_node]</tt>; the child is entered when any of the
# matchers succeeds (or when the caller allows levels to be skipped).
class HashMatcher
  attr_reader :rules, :sets

  def initialize
    @rules = []
    @sets = {}
    # Node that +set+ and +match+ currently write to. +match+ points it at
    # the child node while the child's block runs.
    @context = self
  end

  # Evaluates every *.rb file directly inside +folder+ as rule DSL, with this
  # matcher as +self+.
  #
  # Files are loaded in sorted order so that rule precedence does not depend
  # on the order in which the file system happens to list them.
  #
  # NOTE(review): rule files are executed with +eval+; only point this at
  # folders whose contents you trust.
  def include_folder(folder)
    Dir["#{folder}/*.rb"].sort.each do |file|
      @current_folder = folder
      contents = File.read(file)
      # Passing the file name gives errors in rule files a usable location.
      eval(contents, binding, file)
    end
  end

  # Includes a folder relative to the folder of the rule file being loaded.
  def include_subfolder(folder)
    include_folder "#{@current_folder}/#{folder}"
  end

  # Renders the rule tree as indented text; +i+ is the indentation depth.
  def to_s(i = 0)
    indent = ' ' * i
    lines = sets.map { |k, v| "#{indent}#{k} = #{v}\n" }
    rules.each do |regexes, matcher|
      lines << "\n#{indent}If match #{regexes}\n"
      lines << matcher.to_s(i + 1)
      lines << "#{indent}End\n"
    end
    lines.join
  end

  # Walks the rule tree and collects matches for +string+.
  #
  # opts (this hash is modified while the tree is walked):
  # * :limit       - decremented for every rule entered without skipping;
  #                  the walk of a level stops when it reaches exactly 0.
  #                  Defaults to 1.
  # * :skip_levels - number of levels that may be entered without a matcher
  #                  succeeding. Defaults to 0.
  #
  # +level+ is the depth of this node (1 for the root's rules).
  #
  # Returns an array of hashes with the keys 'data', 'coverage',
  # 'matchlevel' (set when a matcher on the path succeeded) and :match_id.
  def analyze(string, opts = {}, level = 1)
    matches = []
    opts[:limit] ||= 1
    skip_levels = opts[:skip_levels] || 0

    rules.each do |regexes, matcher|
      offsets = []
      matched = regexes.find { |r| offsets = test(string, r) }
      next unless matched || skip_levels > 0

      opts[:skip_levels] = skip_levels - 1
      sub_matches = matcher.analyze(string, opts, level + 1)

      sub_matches.each do |m|
        # Values set deeper in the tree win over values set on this node.
        m['data'] = sets.merge(m['data'])
        next unless offsets

        m['matchlevel'] = level
        offsets.each do |start, stop|
          # MatchData#offset gives an exclusive end; coverage is inclusive.
          stop -= 1

          # Word-boundary spaces captured by the pattern are not coverage.
          start += 1 if string[start] == ' '
          stop -= 1 if string[stop] == ' '

          m['coverage'] << [start, stop]
        end
      end

      matches += sub_matches

      opts[:limit] -= 1 if skip_levels <= 0
      break if opts[:limit] == 0
    end

    # Leaf reached after at least one real (non-skipped) step: emit a match
    # carrying this node's own values.
    if matches.empty? && skip_levels < 0
      matches << { 'data' => sets.dup, 'coverage' => [], match_id: object_id }
    end

    matches
  end

  private

  # Tests +string+ against a matcher.
  #
  # Returns nil when the matcher fails, otherwise an array of [start, stop]
  # offset pairs. A successful +no+ matcher covers nothing and is reported
  # as the sentinel pair [-1, -1]; +both+ strips those sentinels from its
  # combined result.
  def test(string, matcher)
    case matcher
    when NoClass
      [[-1, -1]] unless test(string, matcher.regex)
    when AndClass
      results = matcher.regexes.map { |r| test(string, r) }
      # An empty offset list (e.g. a nested both() made only of no()s) is
      # still a success, so check for nil/false rather than emptiness.
      results.flatten(1).reject { |offset| offset[0] == -1 } if results.all?
    else
      m = matcher.match(string)
      [m.offset(0)] if m
    end
  end

  # DSL: adds key/value pairs (keys stringified) to the current node.
  def set(sub_hash)
    @context.sets.merge! stringified(sub_hash)
  end

  # DSL: adds a child node to the current node, entered when any of +args+
  # (strings, regexes, no(...) or both(...)) matches. The block defines the
  # child's contents.
  def match(*args, &block)
    regexes = args.map { |r| to_regex(r) }
    matcher = HashMatcher.new
    old_context = @context
    old_folder = @current_folder
    @context.rules << [regexes, matcher]
    @context = matcher
    begin
      block.call
    ensure
      # Restore even if the block raises, so the builder state stays sane.
      @context = old_context
      @current_folder = old_folder
    end
  end

  # DSL: makes +regex+ match whole (space-delimited) words only.
  #
  # The regex is interpolated as a whole, which wraps it in its own group:
  # its options (such as /i) are kept and a top-level alternation like
  # /a|b/ stays inside the word boundaries.
  def w(regex)
    /(^| )#{regex}($| )/
  end

  # DSL: succeeds when +regex+ (string or regex) does NOT match.
  def no(regex)
    NoClass.new(to_regex(regex))
  end

  # DSL: succeeds when all given matchers succeed; may be nested.
  def both(*regexes)
    AndClass.new(to_regex(regexes))
  end

  # Converts strings to whole-word regexes (recursing into arrays); any
  # other matcher is returned unchanged.
  def to_regex(matcher)
    if matcher.kind_of?(String)
      /(^| )#{Regexp.escape(matcher)}($| )/
    elsif matcher.kind_of?(Array)
      matcher.map { |r| to_regex(r) }
    else
      matcher
    end
  end

  # Replaces every key of +hash+ with its string form, in place, and
  # returns the same hash.
  def stringified(hash)
    hash.keys.each do |key|
      val = hash.delete(key)
      hash[key.to_s] = val
    end
    hash
  end

  # Negated matcher produced by +no+.
  class NoClass
    attr_reader :regex

    def initialize(regex)
      @regex = regex
    end

    def to_s
      "!(#{@regex})"
    end
  end

  # Conjunction of matchers produced by +both+.
  class AndClass
    attr_reader :regexes

    def initialize(regexes)
      @regexes = regexes
    end

    def to_s
      @regexes.map { |r| r.inspect }.join(' AND ')
    end
  end
end
|
data/lib/hash_rules.rb
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
require 'hash_matcher'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
# Entry point of the gem: loads a folder of rule files into a HashMatcher
# tree and runs strings through it, caching the results.
class HashRules

  # args:
  # * :folder - path to the folder containing the rule files (required).
  #
  # Raises RuntimeError when no folder is given.
  def initialize(args)
    @folder = args[:folder] || raise("No folder specified!")

    @hashmatcher = HashMatcher.new
    @hashmatcher.include_folder(@folder)
    @cache = {}
  end

  # Runs +string+ through the rules and returns an array of result hashes,
  # best match first.
  #
  # opts:
  # * :max_submatch_level - how many levels may be skipped (default 0).
  # * :limit              - stop after this many matches (default 1; zero
  #                         or negative means unlimited).
  #
  # Results are cached per cleaned string AND effective options, so calls
  # with different options never see each other's results. Cached results
  # are stored marshalled and returned as fresh copies.
  def process(string, opts = {})
    string = clean_string(string)
    cache_key = [string, opts[:max_submatch_level] || 0, opts[:limit] || 1]

    if (cached = @cache[cache_key])
      return Marshal.load(cached)
    end

    result = Processor.new(string, @hashmatcher, opts).do

    @cache[cache_key] = Marshal.dump(result)
    result
  end

  # Collapses every run of whitespace into a single space.
  def clean_string(string)
    string.gsub(/\s+/, ' ')
  end

  def to_s
    "== HASHRULES ==#{@hashmatcher}"
  end

  # Runs one string through the matcher tree, once per allowed submatch
  # level, and ranks the collected matches.
  class Processor
    def initialize(string, hashmatcher, opts)
      @string = string.dup
      @hashmatcher = hashmatcher
      @max_submatch_level = opts[:max_submatch_level] || 0
      @limit = opts[:limit] || 1
      @memory = []
    end

    # Collects matches level by level until the limit is reached, then
    # returns them sorted best first.
    def do
      each_submatch_level do |submatch_level|
        add_to_list(analyze(submatch_level))
        break if reached_limit?
      end

      sort_by_coverage

      list
    end

    private

    # Asks the matcher tree for matches while skipping +submatch_level+
    # levels, drops matches without data and adds 'percent_coverage'.
    def analyze(submatch_level)
      # A non-positive limit means unlimited (-1 never counts down to 0);
      # otherwise only ask for as many matches as are still missing.
      limit = @limit <= 0 ? -1 : (results_count - @limit).abs
      opts = {
        skip_levels: submatch_level,
        limit: limit
      }

      matches = @hashmatcher.analyze(@string, opts)
      matches = matches.delete_if { |m| m['data'].empty? }

      matches.each do |m|
        m['percent_coverage'] = percent_covered(m['coverage'])
      end

      matches
    end

    # Percentage (0..100, integer) of the string covered by the inclusive
    # [start, stop] pairs in +ranges+. A single uncovered character between
    # two covered ones (the space between adjacent matches) counts as
    # covered. An empty string has nothing to cover and yields 0.
    def percent_covered(ranges)
      return 0 if @string.length.zero?

      coverage = Array.new(@string.length, false)
      ranges.each do |start, stop|
        (start..stop).each do |i|
          coverage[i] = true
        end
      end
      (1..(coverage.count - 2)).each do |i|
        if coverage[i - 1] && coverage[i + 1]
          coverage[i] = true
        end
      end

      coverage.count { |c| c } * 100 / @string.length
    end

    def each_submatch_level(&block)
      (0..@max_submatch_level).each(&block)
    end

    def reached_limit?
      @limit > 0 && @memory.count >= @limit
    end

    def results_count
      @memory.count
    end

    def add_to_list(results)
      @memory += results
    end

    # Orders matches by number of coverage ranges (more first), then by
    # percent coverage (higher first), then by matchlevel (shallower first).
    def sort_by_coverage
      @memory.sort! do |a, b|
        result = b['coverage'].count <=> a['coverage'].count
        if result == 0
          result = b['percent_coverage'] <=> a['percent_coverage']

          if result == 0
            result = a['matchlevel'] <=> b['matchlevel']
          end
        end
        result
      end
    end

    def list
      @memory
    end

  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
|
2
|
+
match w(/pa ?28/) do # => /( |^)pa ?28( |$)/
|
3
|
+
set family: 'PA-28 Cherokee'
|
4
|
+
set engine_count: 1
|
5
|
+
set category: 'piston'
|
6
|
+
|
7
|
+
match 'pa28 181', 'pa 28 181' do
|
8
|
+
set model: 'PA-28-181'
|
9
|
+
set horsepower: 180
|
10
|
+
|
11
|
+
match /archer ii/, /archer 2/, /ii/ do
|
12
|
+
set model: 'PA-28-181 Archer II'
|
13
|
+
end
|
14
|
+
|
15
|
+
match /archer iii/, /archer 3/, /iii/ do
|
16
|
+
set model: 'PA-28-181 Archer III'
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,178 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'test_helper'
|
4
|
+
require 'hash_rules'
|
5
|
+
|
6
|
+
# Behavioural tests for HashRules, run against the rule files in
# test/examples. Nil expectations use assert_nil, because
# `assert_equal nil, x` is deprecated in Minitest 5.
class HashRulesTest < TestCase
  context 'hashrules, normal operation' do

    setup do
      @it = HashRules.new(folder: 'test/examples')
    end

    should 'return empty array when no match' do
      assert_equal [], @it.process('curry')
    end

    should 'identify manufacturer' do
      data = @it.process("1986 piper tjobahobo").first['data']
      assert_equal 'Piper', data['manufacturer']
    end

    should 'identify perfect match' do
      data = @it.process("piper pa28 181 ii").first['data']
      assert_equal 'PA-28-181 Archer II', data['model']
    end

    should 'identify model' do
      data = @it.process("2001 piper pa28 181 ii").first['data']
      assert_equal 'PA-28-181 Archer II', data['model']
    end

    should 'allow numbers or letter next to match' do
      data = @it.process("apiperloon").first['data']
      assert_equal "Piper", data['manufacturer']
    end

    should 'match on string' do
      data = @it.process("piper pa 28 181").first['data']
      assert_equal "PA-28-181", data['model']
    end

    should 'discard double whitespace' do
      data = @it.process("piper \t\r \npa 28\n 181").first['data']
      assert_equal "PA-28-181", data['model']
    end

    should 'use "word" to make regexes match whole words' do
      data = @it.process("piper apa-280").first['data']
      assert_nil data['family']
    end

    should 'present us with matched slices and percentage covered' do
      data = @it.process("i'd like a piper in the pa28 family")
      first = data.first

      assert_equal 'Piper', first['data']['manufacturer']
      assert_equal [[24, 27], [11, 15]], first['coverage']
      assert_equal 25, first['percent_coverage']
    end

    should 'include spaces between matches in percentage' do
      data = @it.process("oregon canada")
      first = data.first

      assert_equal 100, first['percent_coverage']
    end

    context 'both and no' do
      should 'make operation trees' do
        result = @it.process('person')
        assert_equal 'Per', result.first['data']['manufacturer']
      end

      should 'use both with strings and regexes' do
        data = @it.process('string rregexx').first['data']
        assert_equal 'success', data['manufacturer']
      end
    end


    should 'not allow numbers or letter next to STRING match' do
      data = @it.process("piper apa-28 18100").first['data']
      assert_nil data['model']
    end

    should 'not allow several matches in the same context' do
      # note: the rules are written in regex /ii/ which means it will also match /iii/
      data = @it.process('piper pa28 181 archer iii', limit: 1).first['data']
      assert_equal 'PA-28-181 Archer II', data['model']
    end

    context "case insensitive" do
      should 'match cyrillic letters' do
        data = @it.process("ми 8т").first['data']
        assert_equal "Ми-8Т", data['manufacturer']
      end
    end
  end

  context 'multimatch' do

    setup do
      @it = HashRules.new(folder: 'test/examples')
    end

    should 'match several if indicated' do
      r = @it.process('this should return two match', limit: -1)
      assert_equal 2, r.count
      first, second = r

      assert_equal 'first', first['data']['match']
      assert_equal 'second', second['data']['match']
    end
  end

  context 'hashrules, submatch' do

    setup do
      @it = HashRules.new(folder: 'test/examples')
    end

    should 'match with adequate information, just as without submatch' do
      r = @it.process('oregon ohio united states').first['data']
      assert_equal 'United States', r['country']
      assert_equal 'Ohio', r['region']
      assert_equal 'Oregon', r['city']

      r = @it.process('canada oregon').first['data']
      assert_equal 'Canada', r['country']
      assert_equal 'Oregon', r['region']
      assert_nil r['city']
    end

    should 'prefer to match on 2:nd level over 3:rd level' do
      r = @it.process('oregon', max_submatch_level: 1, limit: 1).first['data']

      # united states have a city called 'oregon', but which is on 3rd level,
      # Oregon in canada is a state, and on the 2nd level
      assert_equal 'Canada', r['country']
    end

    should 'prefer several multimatch over a single match and many characters' do

      # there is no such place as 'or us' in canada, however there is or in united states
      r = @it.process('canada or us', max_submatch_level: 6, limit: -1)

      assert_equal 3, r.count
      assert_equal 'United States', r[0]['data']['country']
    end

    should 'strive for 100% matach if possible' do
      r = @it.process('oregon ohio', max_submatch_level: 5, limit: -1)

      assert_equal 3, r.count

      # Without intelligence, Oregon (canada) would be chosen because it's a state on the 2nd level, and Oregon in US is a city on 3rd level. However, we reason that if a 3rd level match can explain more about a string then it is reasonably more likely to be accurate.

      assert_equal 'Ohio', r[0]['data']['region']
    end

    should 'backtrack coverage when found submatch' do
      r = @it.process('pearson ohio', max_submatch_level: 6, limit: -1)

      assert_equal [[0,6],[8,11]], r[0]['coverage']
    end

    should 'include matchlevel' do
      r = @it.process('pearson ohio', max_submatch_level: 6, limit: -1)
      assert_equal 2, r[0]['matchlevel']

      r = @it.process('oregon', max_submatch_level: 6, limit: -1)
      assert_equal 'Canada', r[0]['data']['country']
      assert_equal 2, r[0]['matchlevel']
      assert_equal 'United States', r[1]['data']['country']
      assert_equal 3, r[1]['matchlevel']
    end
  end
end
|
data/test/test_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: hashrules
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.1.4
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Mikael Wikman
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-07-04 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Rule-based hash manipulator using custom DSL
|
14
|
+
email:
|
15
|
+
- mikael@swedcontent.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- CHANGELOG.mdown
|
21
|
+
- Gemfile
|
22
|
+
- Gemfile.lock
|
23
|
+
- LICENSE.txt
|
24
|
+
- README.mdown
|
25
|
+
- Rakefile
|
26
|
+
- app.gemspec
|
27
|
+
- lib/hash_matcher.rb
|
28
|
+
- lib/hash_rules.rb
|
29
|
+
- test/examples/canada/oregon.rb
|
30
|
+
- test/examples/cyrillic.rb
|
31
|
+
- test/examples/match_two.rb
|
32
|
+
- test/examples/piper.rb
|
33
|
+
- test/examples/piper/pa28.rb
|
34
|
+
- test/examples/united states/ohio.rb
|
35
|
+
- test/examples/united states/ohio/oregon.rb
|
36
|
+
- test/examples/world.rb
|
37
|
+
- test/hash_rules_test.rb
|
38
|
+
- test/test_helper.rb
|
39
|
+
homepage: https://github.com/mikaelwikman/hashrules
|
40
|
+
licenses: []
|
41
|
+
metadata: {}
|
42
|
+
post_install_message:
|
43
|
+
rdoc_options: []
|
44
|
+
require_paths:
|
45
|
+
- lib
|
46
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
47
|
+
requirements:
|
48
|
+
- - ! '>='
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: '0'
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
requirements: []
|
57
|
+
rubyforge_project:
|
58
|
+
rubygems_version: 2.0.3
|
59
|
+
signing_key:
|
60
|
+
specification_version: 4
|
61
|
+
summary: ''
|
62
|
+
test_files:
|
63
|
+
- test/examples/canada/oregon.rb
|
64
|
+
- test/examples/cyrillic.rb
|
65
|
+
- test/examples/match_two.rb
|
66
|
+
- test/examples/piper.rb
|
67
|
+
- test/examples/piper/pa28.rb
|
68
|
+
- test/examples/united states/ohio.rb
|
69
|
+
- test/examples/united states/ohio/oregon.rb
|
70
|
+
- test/examples/world.rb
|
71
|
+
- test/hash_rules_test.rb
|
72
|
+
- test/test_helper.rb
|