hashrules 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/CHANGELOG.mdown +52 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +37 -0
- data/LICENSE.txt +4 -0
- data/README.mdown +143 -0
- data/Rakefile +9 -0
- data/app.gemspec +16 -0
- data/lib/hash_matcher.rb +167 -0
- data/lib/hash_rules.rb +124 -0
- data/test/examples/canada/oregon.rb +3 -0
- data/test/examples/cyrillic.rb +4 -0
- data/test/examples/match_two.rb +8 -0
- data/test/examples/piper.rb +14 -0
- data/test/examples/piper/pa28.rb +19 -0
- data/test/examples/united states/ohio.rb +4 -0
- data/test/examples/united states/ohio/oregon.rb +7 -0
- data/test/examples/world.rb +10 -0
- data/test/hash_rules_test.rb +178 -0
- data/test/test_helper.rb +12 -0
- metadata +72 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MDJhZmM2MWMyMTkzNjc3NjkwNjYzZmRlMjkwMjllNjFkZmJlNmRlYw==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
YjZiYTk1NDBkY2I1MzgzNmY4YmI5NGRhYTkyMDkwOGU4MTY0ODY5ZQ==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
YTgyMzk0NzQ3MDdiOWNiM2JlMjM3ZmM4ZTIwYTc3YWI2NDliNWEzNzE2OGRh
|
10
|
+
MWIwNGM3ZGI0MWU5YzhhYmFiMDBkN2ZkYzE1ZDUxYzE4ODVjOGVmMzg1Zjdk
|
11
|
+
ZTQ5MDdhNTc1MWE3YzNiMDBhZjA0MWM1ZjRjMjdmYmFmM2YwNDg=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
MzRiMGI0N2NmYTk5NDhkZWM5MDQxNDM3ODU4ZDM2Y2Q5YWQ5YmJiM2E2M2Vl
|
14
|
+
ZmQxMDQyYzZmZjk3ZDU3OTk3Yjc3Mzg5ZmMxZTVhOThkN2VhZWI4OWNhMTQz
|
15
|
+
M2U3NjNiNjRjMzgzMmEzZDNjZjYyMmZhMjM0NWM0N2Y3N2U2NWM=
|
data/CHANGELOG.mdown
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
|
2
|
+
# ChangeLog
|
3
|
+
|
4
|
+
## 1.1.4
|
5
|
+
|
6
|
+
* Bumped version due to need to republish to RubyGems (last version did not include all files)
|
7
|
+
|
8
|
+
## 1.1.3
|
9
|
+
|
10
|
+
* Improved cache hits
|
11
|
+
|
12
|
+
## 1.1.2
|
13
|
+
|
14
|
+
* Sort results that explain more matches above those that explain fewer, even if the latter cover more characters
|
15
|
+
|
16
|
+
## 1.1.1
|
17
|
+
|
18
|
+
* Percent coverage should include spaces between matches, otherwise a fully covered string would never reach 100%
|
19
|
+
|
20
|
+
## 1.1.0
|
21
|
+
|
22
|
+
* Added result caching
|
23
|
+
|
24
|
+
## 1.0.3
|
25
|
+
|
26
|
+
* Bugfix: Some cases would sort a low-level submatch over a higher one. Now matchlevel is set accordingly and used for sorting when coverage is equal
|
27
|
+
|
28
|
+
## 1.0.2
|
29
|
+
|
30
|
+
* Bugfix: Coverage was not set for parent matchers in the case of a successful submatch. This meant that sometimes a first-level match would be ranked lower (treated as a submatch) when another submatch was found.
|
31
|
+
|
32
|
+
## 1.0.1
|
33
|
+
|
34
|
+
* Bugfix: When using smart search / submatching - don't count whitespace as coverage. This caused /germany/ in first level to gain less priority than /( |^)germany( |$)/ in USA (second level).
|
35
|
+
* Bugfix: Fix a certain case where a Limit set to 1 would gain -1 (unlimited) interpretation
|
36
|
+
|
37
|
+
## 1.0.0
|
38
|
+
|
39
|
+
This is a major version which breaks backwards compatibility.
|
40
|
+
|
41
|
+
* Changed API: HashRules#process now takes a single string and returns a new hash
|
42
|
+
* Added submatch feature to find results (deeply) nested even if a top level couldn't match.
|
43
|
+
* HashRules no longer cleans string before process (such as removing - and /), this is now the responsibility of the application.
|
44
|
+
|
45
|
+
## 0.2.3
|
46
|
+
|
47
|
+
* Fix issue with using both() with strings (previously it only accepted NoClass matchers and regexes)
|
48
|
+
|
49
|
+
## 0.2
|
50
|
+
|
51
|
+
* Add matcher keywords both() for applying AND operator and no() for negating an expression
|
52
|
+
* Change name to sc-hashrules
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
GEM
|
2
|
+
remote: https://rubygems.org/
|
3
|
+
specs:
|
4
|
+
activesupport (3.2.9)
|
5
|
+
i18n (~> 0.6)
|
6
|
+
multi_json (~> 1.0)
|
7
|
+
ansi (1.4.3)
|
8
|
+
awesome_print (1.0.1)
|
9
|
+
bourne (1.1.2)
|
10
|
+
mocha (= 0.10.5)
|
11
|
+
htmlentities (4.3.1)
|
12
|
+
i18n (0.6.1)
|
13
|
+
metaclass (0.0.1)
|
14
|
+
mocha (0.10.5)
|
15
|
+
metaclass (~> 0.0.1)
|
16
|
+
multi_json (1.5.0)
|
17
|
+
rake (10.0.3)
|
18
|
+
shoulda (3.3.2)
|
19
|
+
shoulda-context (~> 1.0.1)
|
20
|
+
shoulda-matchers (~> 1.4.1)
|
21
|
+
shoulda-context (1.0.1)
|
22
|
+
shoulda-matchers (1.4.2)
|
23
|
+
activesupport (>= 3.0.0)
|
24
|
+
bourne (~> 1.1.2)
|
25
|
+
turn (0.9.6)
|
26
|
+
ansi
|
27
|
+
|
28
|
+
PLATFORMS
|
29
|
+
ruby
|
30
|
+
|
31
|
+
DEPENDENCIES
|
32
|
+
awesome_print
|
33
|
+
htmlentities
|
34
|
+
mocha
|
35
|
+
rake
|
36
|
+
shoulda
|
37
|
+
turn
|
data/LICENSE.txt
ADDED
data/README.mdown
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
|
2
|
+
# HashRules
|
3
|
+
|
4
|
+
A simple yet powerful gem for building rule sets that identify the contents of a string in a structured, hierarchical manner.
|
5
|
+
|
6
|
+
## Example
|
7
|
+
|
8
|
+
Suppose you're building a system for identifying countries, regions and cities within a string. An excerpt might look like this:
|
9
|
+
|
10
|
+
```ruby
|
11
|
+
match 'united states', w(/u ?s ?a/) do
|
12
|
+
set country: 'United States'
|
13
|
+
|
14
|
+
match /( |^)cali/, 'ca' do
|
15
|
+
set state: 'California'
|
16
|
+
set region: 'West Coast'
|
17
|
+
|
18
|
+
match 'long beach', 'longbeach' do
|
19
|
+
set city: 'Long Beach'
|
20
|
+
end
|
21
|
+
|
22
|
+
match 'fontana' do
|
23
|
+
set city: 'Fontana'
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
```
|
28
|
+
|
29
|
+
This simple DSL translates to a series of OR and AND statements. For example, if we were to write the exact same logic in plain if/then/else and regexes, the path for matching all the way to 'Long Beach' would be:
|
30
|
+
|
31
|
+
```ruby
|
32
|
+
text = "Long Beach, California, United States"
|
33
|
+
text.gsub!(',','')
|
34
|
+
result = {}
|
35
|
+
|
36
|
+
if (text =~ /( |^)united states( |$)/ || text =~ /( |^)u ?s ?a( |$)/)
|
37
|
+
result['country'] = 'United States'
|
38
|
+
|
39
|
+
if (text =~ /( |^)cali/ || text =~ /( |^)ca( |$)/)
|
40
|
+
result['state'] = 'California'
|
41
|
+
result['region'] = 'West Coast'
|
42
|
+
|
43
|
+
if (text =~ /( |^)long beach( |$)/ || text =~ /( |^)longbeach( |$)/)
|
44
|
+
result['city'] = 'Long Beach'
|
45
|
+
elsif text =~ /( |^)fontana( |$)/
|
46
|
+
result['city'] = 'Fontana'
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
```
|
51
|
+
|
52
|
+
## Available matchers
|
53
|
+
<table>
|
54
|
+
<tr>
|
55
|
+
<th>Notation</th><th>Equivalence</th><th>Use case</th>
|
56
|
+
</tr>
|
57
|
+
<tr>
|
58
|
+
<td>
|
59
|
+
w(regex)
|
60
|
+
</td>
|
61
|
+
<td>
|
62
|
+
/( |^)#{regex}( |$)/
|
63
|
+
</td>
|
64
|
+
<td>
|
65
|
+
Create regex matching only whole words
|
66
|
+
</td>
|
67
|
+
</tr>
|
68
|
+
|
69
|
+
<tr>
|
70
|
+
<td>
|
71
|
+
'regex'
|
72
|
+
</td>
|
73
|
+
<td>
|
74
|
+
/( |^)#{Regexp.escape(regex)}( |$)/
|
75
|
+
</td>
|
76
|
+
<td>
|
77
|
+
String matching whole words. 'key' would match "I have a key" but not "monkey"
|
78
|
+
</td>
|
79
|
+
</tr>
|
80
|
+
|
81
|
+
<tr>
|
82
|
+
<td>
|
83
|
+
no(matcher)
|
84
|
+
</td>
|
85
|
+
<td>
|
86
|
+
!(matcher.success?)
|
87
|
+
</td>
|
88
|
+
<td>
|
89
|
+
Inverted string or regex matcher
|
90
|
+
</td>
|
91
|
+
</tr>
|
92
|
+
|
93
|
+
<tr>
|
94
|
+
<td>
|
95
|
+
both(A,B)
|
96
|
+
</td>
|
97
|
+
<td>
|
98
|
+
A.success? && B.success?
|
99
|
+
</td>
|
100
|
+
<td>
|
101
|
+
Logical AND of given matchers, which may be nested
|
102
|
+
</td>
|
103
|
+
</tr>
|
104
|
+
</table>
|
105
|
+
|
106
|
+
## Installation
|
107
|
+
|
108
|
+
`gem install hashrules`
|
109
|
+
|
110
|
+
## Usage
|
111
|
+
|
112
|
+
```
|
113
|
+
require 'hash_rules'
|
114
|
+
|
115
|
+
rules = HashRules.new(args)
|
116
|
+
|
117
|
+
# args:
|
118
|
+
# * folder: path to the folder containing one or more rule files (all will be read)
|
119
|
+
|
120
|
+
process_args={}
|
121
|
+
results = rules.process('Alabama, Canada', process_args)
|
122
|
+
|
123
|
+
results.each do |result|
|
124
|
+
p result
|
125
|
+
end
|
126
|
+
```
|
127
|
+
|
128
|
+
Each result contains the following keys:
|
129
|
+
|
130
|
+
* data: This is the resulting data accumulated from the rules `set` statements
|
131
|
+
* coverage: An array of index pairs showing what part of the string was matched.
|
132
|
+
* match_id: Indicates the deepest matching rule id
|
133
|
+
* matchlevel: Indicates how deeply nested the first match was. If `max_submatch_level` is zero, this will always be 1.
|
134
|
+
* percent_coverage: A number from 0 to 100 indicating what percentage of the string was matched
|
135
|
+
|
136
|
+
Process_args may be any of the following:
|
137
|
+
|
138
|
+
* max_submatch_level: Allow search to start from a nested matcher. If this value is 1, this means HashRules may skip at the very most one level. Recommended to be used with limit: -1
|
139
|
+
* limit: Stop searching when this number of matches has been found. Good for performance, but not recommended if `max_submatch_level` is more than zero, because the results are then not guaranteed to contain the best match.
|
140
|
+
|
141
|
+
## License
|
142
|
+
|
143
|
+
<a rel="license" href="http://creativecommons.org/licenses/by-sa/3.0/deed.en_US"><img alt="Creative Commons License" style="border-width:0" src="http://i.creativecommons.org/l/by-sa/3.0/80x15.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" property="dct:title">HashRules</span> by <span xmlns:cc="http://creativecommons.org/ns#" property="cc:attributionName">Mikael Wikman</span> is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/3.0/deed.en_US">Creative Commons Attribution-ShareAlike 3.0 Unported License</a>.
|
data/Rakefile
ADDED
data/app.gemspec
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
Gem::Specification.new do |gem|
|
4
|
+
gem.authors = ['Mikael Wikman']
|
5
|
+
gem.email = ['mikael@swedcontent.com']
|
6
|
+
gem.description = %q{Rule-based hash manipulator using custom DSL}
|
7
|
+
gem.summary = %q{ }
|
8
|
+
gem.homepage = "https://github.com/mikaelwikman/hashrules"
|
9
|
+
|
10
|
+
gem.files = `git ls-files`.split("\n")
|
11
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
12
|
+
gem.test_files = gem.files.grep(%r{^(test|features)/})
|
13
|
+
gem.name = "hashrules"
|
14
|
+
gem.require_paths = ["lib"]
|
15
|
+
gem.version = '1.1.4'
|
16
|
+
end
|
data/lib/hash_matcher.rb
ADDED
@@ -0,0 +1,167 @@
|
|
1
|
+
class HashMatcher
|
2
|
+
attr_reader :rules, :sets
|
3
|
+
|
4
|
+
def initialize
|
5
|
+
@rules = []
|
6
|
+
@sets = {}
|
7
|
+
@context = self
|
8
|
+
end
|
9
|
+
|
10
|
+
# Loads every `*.rb` rule file found directly inside +folder+ and evaluates
# it against this matcher's binding, so DSL calls in the file (`match`,
# `set`, `include_subfolder`, ...) run with this object as +self+.
#
# Files are evaluated in sorted path order so that rule order -- and with it
# which rule is tried first -- does not depend on the order in which the
# filesystem happens to list the directory.
#
# NOTE(security): rule files are executed with +eval+; only point this at
# folders whose contents you trust.
#
# folder - path of the directory containing the rule files.
#
# Returns the sorted list of file paths that were evaluated.
def include_folder folder
  Dir["#{folder}/*.rb"].sort.each do |file|
    # Set again for every file: include_subfolder calls back into this
    # method and overwrites @current_folder with the subfolder's path.
    @current_folder = folder
    # Passing the path makes errors and backtraces point at the rule file
    # instead of an anonymous "(eval)".
    eval(File.read(file), binding, file)
  end
end
|
17
|
+
|
18
|
+
def include_subfolder folder
|
19
|
+
include_folder "#{@current_folder}/#{folder}"
|
20
|
+
end
|
21
|
+
|
22
|
+
def to_s i=0
|
23
|
+
result = ""
|
24
|
+
sets.each do |k,v|
|
25
|
+
result << "#{" "*i}#{k} = #{v}\n"
|
26
|
+
end
|
27
|
+
rules.each do |regexes, matcher|
|
28
|
+
result << "\n#{" "*i}If match #{regexes.to_s}\n"
|
29
|
+
result << matcher.to_s(i+1)
|
30
|
+
result << "#{" "*i}End\n"
|
31
|
+
end
|
32
|
+
result
|
33
|
+
end
|
34
|
+
|
35
|
+
def analyze string, opts={}, level=1
|
36
|
+
matches = []
|
37
|
+
opts[:limit] ||= 1
|
38
|
+
skip_levels = opts[:skip_levels] || 0
|
39
|
+
|
40
|
+
rules.each do |regexes, matcher|
|
41
|
+
offsets = []
|
42
|
+
if regexes.find{|r| offsets=test(string,r)} || skip_levels > 0
|
43
|
+
opts[:skip_levels] = skip_levels-1
|
44
|
+
sub_matches = matcher.analyze(string, opts, level+1)
|
45
|
+
|
46
|
+
sub_matches.map do |m|
|
47
|
+
m['data'] = sets.merge(m['data'])
|
48
|
+
if offsets
|
49
|
+
m['matchlevel'] = level
|
50
|
+
offsets.each do |offset|
|
51
|
+
start, stop = offset
|
52
|
+
|
53
|
+
stop -= 1
|
54
|
+
|
55
|
+
start += 1 if string[start] == ' '
|
56
|
+
stop -= 1 if string[stop] == ' '
|
57
|
+
|
58
|
+
m['coverage'] << [start,stop]
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
matches += sub_matches
|
64
|
+
|
65
|
+
if skip_levels <= 0
|
66
|
+
opts[:limit] -= 1
|
67
|
+
end
|
68
|
+
|
69
|
+
if (opts[:limit]) == 0
|
70
|
+
break
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
if matches.empty? && skip_levels < 0
|
75
|
+
matches << { 'data' => sets.dup, 'coverage' => [], match_id: self.object_id}
|
76
|
+
end
|
77
|
+
|
78
|
+
matches
|
79
|
+
end
|
80
|
+
|
81
|
+
private
|
82
|
+
|
83
|
+
def test string, matcher
|
84
|
+
if matcher.is_a?(NoClass)
|
85
|
+
m = test(string,matcher.regex)
|
86
|
+
[[-1,-1]] if !m
|
87
|
+
elsif matcher.is_a?(AndClass)
|
88
|
+
r = matcher.regexes.map{|r| a=test(string,r); a[0] if a}
|
89
|
+
if r.all?{|r| r}
|
90
|
+
r.find_all{|r| r[0] != -1}
|
91
|
+
end
|
92
|
+
else
|
93
|
+
m = matcher.match(string)
|
94
|
+
[m.offset(0)] if m
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
def set sub_hash
|
99
|
+
@context.sets.merge! stringified sub_hash
|
100
|
+
end
|
101
|
+
|
102
|
+
# DSL keyword: registers a nested rule on the current context.
#
# A fresh HashMatcher is appended to the current context's rules, paired
# with the given matchers (each converted through to_regex). The block is
# then run with that new matcher as the context, so `set` and nested `match`
# calls inside the block apply to it.
#
# The previous context and current folder are restored afterwards, even if
# the block raises, so a failing rule file cannot leave this matcher
# pointing at a half-built child.
#
# args  - strings, regexes or matcher objects; the rule is registered with
#         all of them.
# block - body of the rule; optional (a rule without a body is registered
#         with an empty matcher).
def match *args, &block
  regexes = args.map { |r| to_regex(r) }
  matcher = HashMatcher.new
  old_context = @context
  old_folder = @current_folder
  @context.rules << [regexes, matcher]
  @context = matcher
  begin
    block.call if block
  ensure
    @context = old_context
    @current_folder = old_folder
  end
end
|
113
|
+
|
114
|
+
# DSL helper: wraps +regex+ so it only matches whole words, i.e. the match
# must be preceded by the start of a line or a space and followed by the end
# of a line or a space.
#
# The regex is interpolated as an object rather than via its bare source.
# This keeps its own flags (for example /i) and wraps it in a group, so an
# alternation such as /cat|dog/ is delimited as a whole instead of only
# anchoring its first and last branch.
#
# regex - the Regexp to wrap.
#
# Returns a new Regexp.
def w(regex) # make it match whole words
  /(^| )#{regex}($| )/
end
|
117
|
+
|
118
|
+
def no(regex)
|
119
|
+
NoClass.new(to_regex(regex))
|
120
|
+
end
|
121
|
+
|
122
|
+
def both(*regexes)
|
123
|
+
AndClass.new(to_regex(regexes))
|
124
|
+
end
|
125
|
+
|
126
|
+
def to_regex(matcher)
|
127
|
+
if matcher.kind_of?(String)
|
128
|
+
/(^| )#{Regexp.escape(matcher)}($| )/
|
129
|
+
elsif matcher.kind_of?(Array)
|
130
|
+
matcher.map{|r| to_regex(r)}
|
131
|
+
else
|
132
|
+
matcher
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
136
|
+
# Returns a copy of +hash+ whose keys have all been converted with to_s.
#
# A new hash is built instead of deleting and re-inserting entries in the
# argument, so the hash passed in by the caller is left untouched. Entry
# order is preserved; if two keys stringify to the same value the later
# entry wins.
#
# hash - the hash whose keys should be stringified.
def stringified hash
  hash.each_with_object({}) do |(key, val), out|
    out[key.to_s] = val
  end
end
|
143
|
+
|
144
|
+
class NoClass
|
145
|
+
attr_reader :regex
|
146
|
+
|
147
|
+
def initialize regex
|
148
|
+
@regex = regex
|
149
|
+
end
|
150
|
+
|
151
|
+
def to_s
|
152
|
+
"!(#{@regex})"
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
class AndClass
|
157
|
+
attr_reader :regexes
|
158
|
+
|
159
|
+
def initialize regexes
|
160
|
+
@regexes = regexes
|
161
|
+
end
|
162
|
+
|
163
|
+
def to_s
|
164
|
+
@regexes.map{|r| r.inspect}.join(' AND ')
|
165
|
+
end
|
166
|
+
end
|
167
|
+
end
|
data/lib/hash_rules.rb
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
require 'hash_matcher'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
class HashRules
|
5
|
+
|
6
|
+
def initialize args
|
7
|
+
@folder = args[:folder] || raise("No folder specified!")
|
8
|
+
|
9
|
+
@hashmatcher = HashMatcher.new
|
10
|
+
@hashmatcher.include_folder(@folder)
|
11
|
+
@cache = {}
|
12
|
+
end
|
13
|
+
|
14
|
+
# Runs the loaded rules against +string+ and returns the list of results.
#
# The string is first normalised with clean_string. Results are cached as
# Marshal dumps, so every call returns an independent copy that the caller
# may modify freely.
#
# The cache is keyed on both the cleaned string and the options: the options
# are handed to the Processor that computes the result, so a result computed
# for one set of options must not be served for another.
#
# string - the text to analyse.
# opts   - options hash passed on to Processor (default: {}).
#
# Returns the result produced by Processor#do.
def process string, opts={}
  string = clean_string(string)
  # Snapshot opts so that later changes to the caller's hash cannot alter a
  # key that is already stored in the cache.
  cache_key = [string, opts.dup]

  if cached = @cache[cache_key]
    return Marshal.load(cached)
  end

  result = Processor.new(string, @hashmatcher, opts).do

  @cache[cache_key] = Marshal.dump(result)
  result
end
|
26
|
+
|
27
|
+
# Collapses every run of whitespace (spaces, tabs, newlines, ...) in
# +string+ into a single space and returns the result as a new string.
# Leading and trailing whitespace is collapsed too, not removed.
#
# The call is parenthesised because a bare `gsub /\s+/, ' '` makes Ruby warn
# about an ambiguous first argument.
def clean_string string
  string.gsub(/\s+/, ' ')
end
|
30
|
+
|
31
|
+
def to_s
|
32
|
+
"== HASHRULES ==" << @hashmatcher.to_s
|
33
|
+
end
|
34
|
+
|
35
|
+
class Processor
|
36
|
+
def initialize string, hashmatcher, opts
|
37
|
+
@string = string.dup
|
38
|
+
@hashmatcher = hashmatcher
|
39
|
+
@max_submatch_level = opts[:max_submatch_level] || 0
|
40
|
+
@limit = opts[:limit] || 1
|
41
|
+
@memory = []
|
42
|
+
end
|
43
|
+
|
44
|
+
def do
|
45
|
+
each_submatch_level do |submatch_level|
|
46
|
+
add_to_list(new_results = analyze(submatch_level))
|
47
|
+
break if reached_limit?
|
48
|
+
end
|
49
|
+
|
50
|
+
sort_by_coverage
|
51
|
+
|
52
|
+
list
|
53
|
+
end
|
54
|
+
|
55
|
+
private
|
56
|
+
|
57
|
+
# Runs the hash matcher once, allowing +submatch_level+ levels to be
# skipped, and annotates every result with its 'percent_coverage'.
#
# Results whose 'data' is empty are discarded. For the remaining ones the
# [start, stop] index pairs in 'coverage' are marked on a per-character map
# of the string; a single uncovered character that sits between two covered
# ones is counted as covered as well. The percentage is the integer share of
# covered characters.
#
# An empty string is reported as 0 percent instead of dividing by zero.
#
# submatch_level - passed to the matcher as :skip_levels.
#
# Returns the array of remaining results.
def analyze submatch_level
  # A non-positive limit is passed on as -1; otherwise the matcher gets the
  # distance between the configured limit and the results collected so far.
  limit = @limit <= 0 ? -1 : (results_count() - @limit).abs
  opts = {
    skip_levels: submatch_level,
    limit: limit
  }

  matches = @hashmatcher.analyze(@string, opts)
  matches = matches.delete_if { |m| m['data'].empty? }

  length = @string.length

  matches.each do |m|
    coverage = Array.new(length, false)
    m['coverage'].each do |start, stop|
      (start..stop).each do |i|
        coverage[i] = true
      end
    end

    # Fill one-character gaps between two covered characters.
    (1..(coverage.count - 2)).each do |i|
      coverage[i] = true if coverage[i - 1] && coverage[i + 1]
    end

    covered = coverage.count { |c| c }
    m['percent_coverage'] = length.zero? ? 0 : covered * 100 / length
  end

  matches
end
|
85
|
+
|
86
|
+
def each_submatch_level &block
|
87
|
+
(0..@max_submatch_level).each &block
|
88
|
+
end
|
89
|
+
|
90
|
+
def reached_limit?
|
91
|
+
@limit > 0 && @memory.count >= @limit
|
92
|
+
end
|
93
|
+
|
94
|
+
def results_count
|
95
|
+
@memory.count
|
96
|
+
end
|
97
|
+
|
98
|
+
def add_to_list results
|
99
|
+
@memory += results
|
100
|
+
# results.each do |result|
|
101
|
+
# @memory << result unless @memory.any?{|m| m[:match_id] == result[:match_id]}
|
102
|
+
# end
|
103
|
+
end
|
104
|
+
|
105
|
+
def sort_by_coverage
|
106
|
+
@memory.sort! do |a,b|
|
107
|
+
result = b['coverage'].count <=> a['coverage'].count
|
108
|
+
if result == 0
|
109
|
+
result = b['percent_coverage'] <=> a['percent_coverage']
|
110
|
+
|
111
|
+
if result == 0
|
112
|
+
result = a['matchlevel'] <=> b['matchlevel']
|
113
|
+
end
|
114
|
+
end
|
115
|
+
result
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
def list
|
120
|
+
@memory
|
121
|
+
end
|
122
|
+
|
123
|
+
end
|
124
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
|
2
|
+
match w(/pa ?28/) do # => /( |^)pa ?28( |$)/
|
3
|
+
set family: 'PA-28 Cherokee'
|
4
|
+
set engine_count: 1
|
5
|
+
set category: 'piston'
|
6
|
+
|
7
|
+
match 'pa28 181', 'pa 28 181' do
|
8
|
+
set model: 'PA-28-181'
|
9
|
+
set horsepower: 180
|
10
|
+
|
11
|
+
match /archer ii/, /archer 2/, /ii/ do
|
12
|
+
set model: 'PA-28-181 Archer II'
|
13
|
+
end
|
14
|
+
|
15
|
+
match /archer iii/, /archer 3/, /iii/ do
|
16
|
+
set model: 'PA-28-181 Archer III'
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,178 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'test_helper'
|
4
|
+
require 'hash_rules'
|
5
|
+
|
6
|
+
class HashRulesTest < TestCase
|
7
|
+
context 'hashrules, normal operation' do
|
8
|
+
|
9
|
+
setup do
|
10
|
+
@it = HashRules.new(folder: 'test/examples')
|
11
|
+
end
|
12
|
+
|
13
|
+
should 'return empty array when no match' do
|
14
|
+
assert_equal [], @it.process('curry')
|
15
|
+
end
|
16
|
+
|
17
|
+
should 'identify manufacturer' do
|
18
|
+
data = @it.process("1986 piper tjobahobo").first['data']
|
19
|
+
assert_equal 'Piper', data['manufacturer']
|
20
|
+
end
|
21
|
+
|
22
|
+
should 'identify perfect match' do
|
23
|
+
data = @it.process("piper pa28 181 ii").first['data']
|
24
|
+
assert_equal 'PA-28-181 Archer II', data['model']
|
25
|
+
end
|
26
|
+
|
27
|
+
should 'identify model' do
|
28
|
+
data = @it.process("2001 piper pa28 181 ii").first['data']
|
29
|
+
assert_equal 'PA-28-181 Archer II', data['model']
|
30
|
+
end
|
31
|
+
|
32
|
+
should 'allow numbers or letter next to match' do
|
33
|
+
data = @it.process("apiperloon").first['data']
|
34
|
+
assert_equal "Piper", data['manufacturer']
|
35
|
+
end
|
36
|
+
|
37
|
+
should 'match on string' do
|
38
|
+
data = @it.process("piper pa 28 181").first['data']
|
39
|
+
assert_equal "PA-28-181", data['model']
|
40
|
+
end
|
41
|
+
|
42
|
+
should 'discard double whitespace' do
|
43
|
+
data = @it.process("piper \t\r \npa 28\n 181").first['data']
|
44
|
+
assert_equal "PA-28-181", data['model']
|
45
|
+
end
|
46
|
+
|
47
|
+
should 'use "word" to make regexes match whole words' do
|
48
|
+
data = @it.process("piper apa-280").first['data']
|
49
|
+
assert_equal nil, data['family']
|
50
|
+
end
|
51
|
+
|
52
|
+
should 'present us with matched slices and percentage covered' do
|
53
|
+
data = @it.process("i'd like a piper in the pa28 family")
|
54
|
+
first = data.first
|
55
|
+
|
56
|
+
assert_equal 'Piper', first['data']['manufacturer']
|
57
|
+
assert_equal [[24, 27], [11, 15]], first['coverage']
|
58
|
+
assert_equal 25, first['percent_coverage']
|
59
|
+
end
|
60
|
+
|
61
|
+
should 'include spaces between matches in percentage' do
|
62
|
+
data = @it.process("oregon canada")
|
63
|
+
first = data.first
|
64
|
+
|
65
|
+
assert_equal 100, first['percent_coverage']
|
66
|
+
end
|
67
|
+
|
68
|
+
context 'both and no' do
|
69
|
+
should 'make operation trees' do
|
70
|
+
result = @it.process('person')
|
71
|
+
assert_equal 'Per', result.first['data']['manufacturer']
|
72
|
+
end
|
73
|
+
|
74
|
+
should 'use both with strings and regexes' do
|
75
|
+
data = @it.process('string rregexx').first['data']
|
76
|
+
assert_equal 'success', data['manufacturer']
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
|
81
|
+
should 'not allow numbers or letter next to STRING match' do
|
82
|
+
data = @it.process("piper apa-28 18100").first['data']
|
83
|
+
assert_equal nil, data['model']
|
84
|
+
end
|
85
|
+
|
86
|
+
should 'not allow several matches in the same context' do
|
87
|
+
# note: the rules are written in regex /ii/ which means it will also match /iii/
|
88
|
+
data = @it.process('piper pa28 181 archer iii', limit: 1).first['data']
|
89
|
+
assert_equal 'PA-28-181 Archer II', data['model']
|
90
|
+
end
|
91
|
+
|
92
|
+
context "case insensitive" do
|
93
|
+
should 'match cyrillic letters' do
|
94
|
+
data = @it.process("ми 8т").first['data']
|
95
|
+
assert_equal "Ми-8Т", data['manufacturer']
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
context 'multimatch' do
|
101
|
+
|
102
|
+
setup do
|
103
|
+
@it = HashRules.new(folder: 'test/examples')
|
104
|
+
end
|
105
|
+
|
106
|
+
should 'match several if indicated' do
|
107
|
+
r = @it.process('this should return two match', limit: -1)
|
108
|
+
assert_equal 2, r.count
|
109
|
+
first, second = r
|
110
|
+
|
111
|
+
assert_equal 'first', first['data']['match']
|
112
|
+
assert_equal 'second', second['data']['match']
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
context 'hashrules, submatch' do
|
117
|
+
|
118
|
+
setup do
|
119
|
+
@it = HashRules.new(folder: 'test/examples')
|
120
|
+
end
|
121
|
+
|
122
|
+
should 'match with adequate information, just as without submatch' do
|
123
|
+
r = @it.process('oregon ohio united states').first['data']
|
124
|
+
assert_equal 'United States', r['country']
|
125
|
+
assert_equal 'Ohio', r['region']
|
126
|
+
assert_equal 'Oregon', r['city']
|
127
|
+
|
128
|
+
r = @it.process('canada oregon').first['data']
|
129
|
+
assert_equal 'Canada', r['country']
|
130
|
+
assert_equal 'Oregon', r['region']
|
131
|
+
assert_equal nil, r['city']
|
132
|
+
end
|
133
|
+
|
134
|
+
should 'prefer to match on 2:nd level over 3:rd level' do
|
135
|
+
r = @it.process('oregon', max_submatch_level: 1, limit: 1).first['data']
|
136
|
+
|
137
|
+
# united states have a city called 'oregon', but which is on 3rd level,
|
138
|
+
# Oregon in canada is a state, and on the 2nd level
|
139
|
+
assert_equal 'Canada', r['country']
|
140
|
+
end
|
141
|
+
|
142
|
+
should 'prefer several multimatch over a single match and many characters' do
|
143
|
+
|
144
|
+
# there is no such place as 'or us' in canada, however there is or in united states
|
145
|
+
r = @it.process('canada or us', max_submatch_level: 6, limit: -1)
|
146
|
+
|
147
|
+
assert_equal 3, r.count
|
148
|
+
assert_equal 'United States', r[0]['data']['country']
|
149
|
+
end
|
150
|
+
|
151
|
+
should 'strive for 100% matach if possible' do
|
152
|
+
r = @it.process('oregon ohio', max_submatch_level: 5, limit: -1)
|
153
|
+
|
154
|
+
assert_equal 3, r.count
|
155
|
+
|
156
|
+
# Without intelligence, Oregon (canada) would be chosen because it's a state on the 2nd level, and Oregon in US is a city on 3rd level. However, we reason that if a 3rd level match can explain more about a string then it is reasonably more likely to be accurate.
|
157
|
+
|
158
|
+
assert_equal 'Ohio', r[0]['data']['region']
|
159
|
+
end
|
160
|
+
|
161
|
+
should 'backtrack coverage when found submatch' do
|
162
|
+
r = @it.process('pearson ohio', max_submatch_level: 6, limit: -1)
|
163
|
+
|
164
|
+
assert_equal [[0,6],[8,11]], r[0]['coverage']
|
165
|
+
end
|
166
|
+
|
167
|
+
should 'include matchlevel' do
|
168
|
+
r = @it.process('pearson ohio', max_submatch_level: 6, limit: -1)
|
169
|
+
assert_equal 2, r[0]['matchlevel']
|
170
|
+
|
171
|
+
r = @it.process('oregon', max_submatch_level: 6, limit: -1)
|
172
|
+
assert_equal 'Canada', r[0]['data']['country']
|
173
|
+
assert_equal 2, r[0]['matchlevel']
|
174
|
+
assert_equal 'United States', r[1]['data']['country']
|
175
|
+
assert_equal 3, r[1]['matchlevel']
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
data/test/test_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: hashrules
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.1.4
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Mikael Wikman
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-07-04 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Rule-based hash manipulator using custom DSL
|
14
|
+
email:
|
15
|
+
- mikael@swedcontent.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- CHANGELOG.mdown
|
21
|
+
- Gemfile
|
22
|
+
- Gemfile.lock
|
23
|
+
- LICENSE.txt
|
24
|
+
- README.mdown
|
25
|
+
- Rakefile
|
26
|
+
- app.gemspec
|
27
|
+
- lib/hash_matcher.rb
|
28
|
+
- lib/hash_rules.rb
|
29
|
+
- test/examples/canada/oregon.rb
|
30
|
+
- test/examples/cyrillic.rb
|
31
|
+
- test/examples/match_two.rb
|
32
|
+
- test/examples/piper.rb
|
33
|
+
- test/examples/piper/pa28.rb
|
34
|
+
- test/examples/united states/ohio.rb
|
35
|
+
- test/examples/united states/ohio/oregon.rb
|
36
|
+
- test/examples/world.rb
|
37
|
+
- test/hash_rules_test.rb
|
38
|
+
- test/test_helper.rb
|
39
|
+
homepage: https://github.com/mikaelwikman/hashrules
|
40
|
+
licenses: []
|
41
|
+
metadata: {}
|
42
|
+
post_install_message:
|
43
|
+
rdoc_options: []
|
44
|
+
require_paths:
|
45
|
+
- lib
|
46
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
47
|
+
requirements:
|
48
|
+
- - ! '>='
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: '0'
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
requirements: []
|
57
|
+
rubyforge_project:
|
58
|
+
rubygems_version: 2.0.3
|
59
|
+
signing_key:
|
60
|
+
specification_version: 4
|
61
|
+
summary: ''
|
62
|
+
test_files:
|
63
|
+
- test/examples/canada/oregon.rb
|
64
|
+
- test/examples/cyrillic.rb
|
65
|
+
- test/examples/match_two.rb
|
66
|
+
- test/examples/piper.rb
|
67
|
+
- test/examples/piper/pa28.rb
|
68
|
+
- test/examples/united states/ohio.rb
|
69
|
+
- test/examples/united states/ohio/oregon.rb
|
70
|
+
- test/examples/world.rb
|
71
|
+
- test/hash_rules_test.rb
|
72
|
+
- test/test_helper.rb
|