boogex 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -3
- data/Gemfile.lock +1 -1
- data/boogex.gemspec +7 -1
- data/lib/boogex/convertor.rb +128 -15
- data/lib/boogex/version.rb +1 -1
- data/test/convertor_test.rb +29 -6
- data/test/reports/TEST-Boogex.xml +2 -8
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ceb29f8b52c10eb03a3f38dd239e310c9a5a3a3f
|
4
|
+
data.tar.gz: 99547f836bbc224c92989dc0c24aecdf2ddc6344
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c6401a4209df8e76f32598be839ece5982ec89ee555d210e9414b1ff1dace6a59644144627109d0d03eb05167e23ca4540eda7d1bc0de9f3df39126232837025
|
7
|
+
data.tar.gz: 7ff03e8740454cb1df6c30fc2d9ec036b18cf244a439e58b9f5a96739428f9dde11ad8b2e40650d37a62a9918e75a9c6e45e162e018d25ca79777972c77017cf
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/boogex.gemspec
CHANGED
@@ -19,8 +19,14 @@ Gem::Specification.new do |s|
|
|
19
19
|
|Rakefile
|
20
20
|
|\.gitignore
|
21
21
|
|\.rubocop.yml
|
22
|
+
|.*\.gem
|
23
|
+
|.*\.gemspec
|
24
|
+
)$}x
|
25
|
+
end
|
26
|
+
s.test_files = `git ls-files`.split($RS).reject do |file|
|
27
|
+
file =~ %r{^(?:
|
28
|
+
|.*\.gem
|
22
29
|
)$}x
|
23
30
|
end
|
24
|
-
s.test_files = `git ls-files`.split($RS)
|
25
31
|
s.require_paths = ['lib']
|
26
32
|
end
|
data/lib/boogex/convertor.rb
CHANGED
@@ -1,8 +1,34 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
module Boogex
|
4
|
+
AND_REGEX = / AND /
|
5
|
+
OR_REGEX = / OR /
|
6
|
+
NOT_REGEX = / NOT /
|
4
7
|
# Converts a boolean query string into a hash of converted regex parts.
#
# The text is split on NOT_REGEX. The part before the split is converted into
# :inclusive_regex. When a second part exists it is converted into
# :exclusive_regex, and :no_links is set to true if that part contains 'HTTP'.
#
# @param text [String] the boolean query to convert
# @return [Hash] always contains :inclusive_regex; :exclusive_regex and
#   :no_links are only present under the conditions described above
# @raise [RuntimeError] if the text is empty, splits into more than two parts,
#   or a converted part is rejected by validate_regex_syntax!
def self.convert(text)
  texts = text.split(NOT_REGEX)
  fail "The regex '#{text}' split more than twice on 'NOT'" if texts.size > 2
  # String#split returns [] for an empty string, which would hand nil to
  # run_through_convertors; fail with a readable message instead.
  fail "The regex '#{text}' is empty" if texts.empty?

  inclu_text, exclu_text = texts
  regex_hash = { inclusive_regex: run_through_convertors(inclu_text) }

  if exclu_text
    regex_hash[:exclusive_regex] = run_through_convertors(exclu_text)
    regex_hash[:no_links] = true if exclu_text.include?('HTTP')
    validate_regex_syntax!(regex_hash[:exclusive_regex], text)
  end

  validate_regex_syntax!(regex_hash[:inclusive_regex], text)
  regex_hash
end
|
25
|
+
|
26
|
+
# Rejects a converted regex that matches a single space.
#
# Matching also compiles the pattern, so an invalid pattern raises RegexpError
# with a description of what went wrong.
#
# @param regex [String, Regexp] the pattern to check
# @param text [String] the original query text (not used by the check itself)
# @return [nil] when the pattern does not match ' '
# @raise [RuntimeError] when the pattern matches ' '
# @raise [RegexpError] when the pattern cannot be compiled
def self.validate_regex_syntax!(regex, text)
  fail "#{regex} matched on nothing or empty space. Huh?" if ' '.match(regex)
end
|
30
|
+
|
31
|
+
def self.run_through_convertors(text)
|
6
32
|
array = array_struct(text)
|
7
33
|
array = ors_to_pipes(array)
|
8
34
|
array = regex_formatting(array)
|
@@ -17,12 +43,12 @@ module Boogex
|
|
17
43
|
# "a OR (b AND (c OR d)) OR e" => ["a OR ", ["b AND ", ["c OR d"]], " OR e"]
|
18
44
|
def self.array_struct(text)
|
19
45
|
inside_brackets = "[^\(\)]*"
|
46
|
+
not_open_bracket = "[^\(]*"
|
20
47
|
|
21
48
|
#This regex looks for anything in brackets OR anything with brackets in brackets OR anything with brackets in brackets in brackets
|
22
|
-
regex =
|
49
|
+
regex = Regexp.new(get_bracket_regex)
|
23
50
|
|
24
51
|
cuts = text.scan(regex).to_a.flatten.reject(&:nil?)
|
25
|
-
|
26
52
|
# If nothing found then return original text
|
27
53
|
return text if cuts.empty?
|
28
54
|
|
@@ -34,18 +60,32 @@ module Boogex
|
|
34
60
|
result << str
|
35
61
|
else
|
36
62
|
splits = str.split(cut)
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
63
|
+
if splits.size == 2
|
64
|
+
result << splits.first
|
65
|
+
cut_without_brackets = cut[1..-2]
|
66
|
+
result << [cut_without_brackets]
|
67
|
+
result << splits.last
|
68
|
+
elsif splits.size == 1 && str.index(splits[0]) == 0
|
69
|
+
result << splits[0]
|
70
|
+
cut_without_brackets = cut[1..-2]
|
71
|
+
result << [cut_without_brackets]
|
72
|
+
elsif splits.size == 1 && str.index(splits[0]) > 0
|
73
|
+
cut_without_brackets = cut[1..-2]
|
74
|
+
result << [cut_without_brackets]
|
75
|
+
result << splits[0]
|
76
|
+
elsif splits.size == 0
|
77
|
+
cut_without_brackets = cut[1..-2]
|
78
|
+
result << [cut_without_brackets]
|
79
|
+
else
|
80
|
+
fail "This should never happen"
|
81
|
+
end
|
42
82
|
end
|
43
83
|
end
|
44
|
-
end.
|
84
|
+
end.compact
|
45
85
|
|
46
86
|
# This recursively converts any brackets in the text back into the array_struct function
|
47
87
|
# where the upper limit of recursion is 3 levels of bracketing. This is limited by the regex
|
48
|
-
# on line 9 but can
|
88
|
+
# on line 9 but can be extended.
|
49
89
|
# If the element of the array is a string then no recursion to apply.
|
50
90
|
# If the element of the array is an array then iterate THAT through the array_struct function
|
51
91
|
text_array.reject(&:empty?).each_with_object([]) do |str, result|
|
@@ -62,7 +102,9 @@ module Boogex
|
|
62
102
|
|
63
103
|
# This function converts the Lucene Boolean `OR` into regex `|` and removes any quotation marks
|
64
104
|
def self.ors_to_pipes(obj)
|
65
|
-
return obj.gsub(
|
105
|
+
return obj.gsub(OR_REGEX, '|').gsub('"', '').gsub(/\-(?=([^\[]*\[[^\]]*\])*[^\[\]]*$)/, '\-').gsub("'", '') if obj.is_a?(String)
|
106
|
+
|
107
|
+
raise "There are unclosed brackets in this boolean string" if has_unclosed_brackets?(obj)
|
66
108
|
|
67
109
|
# This recursively applies this function to ensure all levels of the array are converted
|
68
110
|
obj.collect do |text|
|
@@ -70,6 +112,12 @@ module Boogex
|
|
70
112
|
end
|
71
113
|
end
|
72
114
|
|
115
|
+
# Reports whether any element holds a different number of '(' and ')'.
#
# Each element is asked for count('(') and count(')'); the result is true as
# soon as one element's two counts differ.
#
# @param obj [Array] elements responding to #count
# @return [Boolean] false when every element's counts agree (or obj is empty)
def self.has_unclosed_brackets?(obj)
  balanced = ->(part) { part.count('(') == part.count(')') }
  !obj.all?(&balanced)
end
|
120
|
+
|
73
121
|
# This function begins to transform the elements of the array structure to regex formatting
|
74
122
|
# including:
|
75
123
|
# - (a) Any elements that are not bookended by | are then wrapped in (?:) as this modularises
|
@@ -85,7 +133,7 @@ module Boogex
|
|
85
133
|
if obj.is_a?(String)
|
86
134
|
if contain_AND?(obj)
|
87
135
|
result = ['AND']
|
88
|
-
result = result + obj.split(
|
136
|
+
result = result + obj.split(AND_REGEX).reject(&:empty?).collect do |str|
|
89
137
|
regex_formatting(str)
|
90
138
|
end
|
91
139
|
return result
|
@@ -101,7 +149,7 @@ module Boogex
|
|
101
149
|
contain_AND?(str)
|
102
150
|
end
|
103
151
|
result = obj.each_with_object(['AND']) do |str, arr|
|
104
|
-
str.split(
|
152
|
+
str.split(AND_REGEX).reject(&:empty?).collect do |str|
|
105
153
|
arr << regex_formatting(str)
|
106
154
|
end
|
107
155
|
end
|
@@ -125,7 +173,7 @@ module Boogex
|
|
125
173
|
|
126
174
|
obj.each_with_object(result) do |text, result|
|
127
175
|
if contain_AND?(text)
|
128
|
-
text.split(
|
176
|
+
text.split(AND_REGEX).reject(&:empty?).each do |str|
|
129
177
|
result << regex_formatting(str)
|
130
178
|
end
|
131
179
|
else
|
@@ -168,7 +216,7 @@ module Boogex
|
|
168
216
|
end
|
169
217
|
|
170
218
|
# Checks whether obj is a String containing the AND keyword (AND_REGEX).
#
# @param obj [Object] value to inspect
# @return [MatchData, nil, false] the match for a String containing the
#   keyword, nil for a String without it, false for any non-String
def self.contain_AND?(obj)
  return false unless obj.is_a?(String)

  AND_REGEX.match(obj)
end
|
173
221
|
|
174
222
|
def self.all_strings?(array)
|
@@ -189,4 +237,69 @@ module Boogex
|
|
189
237
|
# Wraps text in a non-capturing regex group.
#
# @param text [String] the pattern fragment to wrap
# @return [String] the fragment surrounded by '(?:' and ')'
def self.wrap_in_brackets(text)
  opening = '(?:'
  closing = ')'
  opening + text + closing
end
|
240
|
+
|
241
|
+
# Returns the bracket regex source, generating it on first use and reusing the
# stored value on later calls.
#
# @return [String] the value produced by generate_brack_regex
def self.get_bracket_regex
  return @bracket_regex if @bracket_regex

  @bracket_regex = generate_brack_regex
end
|
244
|
+
|
245
|
+
# This function generates the bracket regex. For simplicity, the regex for 'inside of a bracket' is represented by the
|
246
|
+
# string 'a', and the regex outside of a bracket is represented by the string 'b'. These are then substituted out at the end.
|
247
|
+
|
248
|
+
# Builds the bracket regex source as one alternation.
#
# Every sequence from get_bracket_inputs is turned into a template by
# bracket_input_to_brackets, the 'a' and 'b' placeholders are swapped for their
# character classes, and the results are joined with '|'.
#
# @return [String] the alternation of all bracket patterns
def self.generate_brack_regex
  puts "Loading bracket regex..."
  placeholders = {
    'a' => "[^\(\)]*", # inside of a bracket
    'b' => "[^\(]*"    # outside of a bracket
  }

  alternatives = get_bracket_inputs.map do |input|
    bracket_input_to_brackets(input).gsub(/[ab]/, placeholders)
  end
  alternatives.join('|')
end
|
257
|
+
|
258
|
+
# This function loads the valid permutations of the bracket regex where 0 represents an open bracket and 1 means closed bracket.
|
259
|
+
# All possible permutations of bracket ordering are generated and then only valid bracket orderings are selected.
|
260
|
+
# Lists the bracket orderings accepted by valid?, where 0 stands for an open
# bracket and 1 for a close bracket.
#
# Candidates are the binary digits of every integer from 0 to 2000 together
# with the reversed digits. The reversal is needed because Integer#to_s(2)
# never yields a leading 0 (apart from 0 itself). Numbers up to 2000 have at
# most 11 binary digits, so the even-length candidates reach 10 symbols, i.e.
# five bracket pairs.
#
# @return [Array<Array<Integer>>] unique sequences for which valid? is true, in
#   order of first appearance
def self.get_bracket_inputs
  candidates = (0..2000).flat_map do |value|
    bits = value.to_s(2).chars.map(&:to_i)
    [bits, bits.reverse]
  end

  # Deduplicate once, after the loop; uniq keeps the first occurrence, so the
  # order matches appending and de-duplicating on every iteration.
  candidates.uniq.select { |bits| valid?(bits) }
end
|
271
|
+
|
272
|
+
# Decides whether a sequence of 0s (open bracket) and 1s (close bracket) is a
# properly balanced bracket ordering.
#
# A sequence is accepted when it has an even length, holds only 0s and 1s,
# never closes more brackets than have been opened so far, and ends with every
# bracket closed. Those rules also exclude a leading close bracket and a
# trailing open bracket.
#
# @param input [Array<Integer>] sequence to check
# @return [Boolean] true when the ordering is balanced
def self.valid?(input)
  # The total count of brackets must be even.
  return false unless input.size.even?

  # Only have 0's or 1's as inputs
  return false unless input.all? { |v| [0, 1].include?(v) }

  # Track the number of currently open brackets and stop at the first close
  # bracket that has no matching open bracket.
  depth = 0
  input.each do |v|
    depth += v.zero? ? 1 : -1
    return false if depth < 0
  end

  # Every open bracket must have been closed.
  depth.zero?
end
|
294
|
+
|
295
|
+
# This function first converts the sequence of 0's and 1's to open and close brackets.
|
296
|
+
# It then puts in a 'b' string between any close brackets that are followed by an open bracket.
|
297
|
+
# Finally it compresses any consecutive 'a's into a single 'a' as they are idempotent(ie. aaa == a).
|
298
|
+
|
299
|
+
# Turns a sequence of 0s and 1s into a bracket template string.
#
# 0 becomes an escaped open bracket followed by 'a', 1 becomes 'a' followed by
# an escaped close bracket. A 'b' is inserted wherever a close bracket is
# directly followed by an open bracket, and runs of 'a' collapse into one 'a'.
#
# @param input [Array<Integer>] sequence of 0 (open) and 1 (close)
# @return [String] template containing the 'a' and 'b' placeholders
def self.bracket_input_to_brackets(input)
  pieces = ["\\(a", "a\\)"] # index 0: open bracket, index 1: close bracket
  joined = pieces.values_at(*input).join('')

  # Mark the gap between a close bracket and the next open bracket.
  separated = joined.gsub('\\)\\(') { '\\)b\\(' }
  separated.squeeze('a')
end
|
192
305
|
end
|
data/lib/boogex/version.rb
CHANGED
data/test/convertor_test.rb
CHANGED
@@ -4,42 +4,65 @@ describe Boogex do
|
|
4
4
|
it 'turns OR into |' do
|
5
5
|
string = 'This OR That'
|
6
6
|
expecting = '(?:This|That)'
|
7
|
-
result = Boogex.convert(string)
|
7
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
8
8
|
assert_equal expecting, result
|
9
9
|
end
|
10
10
|
|
11
11
|
it 'turns boolean AND into AND array string' do
|
12
12
|
string = 'This AND That'
|
13
13
|
expecting = "AND(['(?:This)','(?:That)'])"
|
14
|
-
result = Boogex.convert(string)
|
14
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
15
15
|
assert_equal expecting, result
|
16
16
|
end
|
17
17
|
|
18
18
|
it 'understands bracketing' do
|
19
19
|
string = '(This OR That) AND My self'
|
20
20
|
expecting = "AND(['(?:This|That)','(?:My self)'])"
|
21
|
-
result = Boogex.convert(string)
|
21
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
22
22
|
assert_equal expecting, result
|
23
23
|
end
|
24
24
|
|
25
25
|
it 'correctly convert this Lucene boolean query string' do
|
26
26
|
string = '(((asd OR dd) AND that) AND this) OR What?'
|
27
27
|
expecting = "AND([AND(['(?:asd|dd)','(?:that)']),'(?:this)'])|What?"
|
28
|
-
result = Boogex.convert(string)
|
28
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
29
29
|
assert_equal expecting, result
|
30
30
|
end
|
31
31
|
|
32
32
|
it 'understands embedded AND' do
|
33
33
|
string = '((im AND researching) AND travelling)'
|
34
34
|
expecting = "AND([AND(['(?:im)','(?:researching)']),'(?:travelling)'])"
|
35
|
-
result = Boogex.convert(string)
|
35
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
36
36
|
assert_equal expecting, result
|
37
37
|
end
|
38
38
|
|
39
39
|
it 'understands double embedded AND' do
|
40
40
|
string = 'Peeps OR ((dude OR roos) AND (what OR Footy))'
|
41
41
|
expecting = "Peeps|AND(['(?:dude|roos)','(?:what|Footy)'])"
|
42
|
-
result = Boogex.convert(string)
|
42
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
43
|
+
assert_equal expecting, result
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'escapes hyphens' do
|
47
|
+
string = '("John Gordon-Smith" OR "hair") AND ("none" OR "all")'
|
48
|
+
expecting = "AND(['(?:John Gordon\\-Smith|hair)','(?:none|all)'])"
|
49
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
50
|
+
assert_equal expecting, result
|
51
|
+
end
|
52
|
+
|
53
|
+
it 'handles unclosed brackets' do
|
54
|
+
string = '("@AUSOlympicTeam" OR "Olympics") AND ("boxing" OR "@Shelley__watts"))'
|
55
|
+
|
56
|
+
assert_raises do
|
57
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'handles multiple brackets' do
|
62
|
+
string = '((@officialcsa OR "south africa" OR "south african") AND ("cricket" OR "proteas") AND ("national" OR "international") AND ("team"))'
|
63
|
+
expecting = "AND(['(?:@officialcsa|south africa|south african)','(?:cricket|proteas)','(?:national|international)','(?:team)'])"
|
64
|
+
|
65
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
43
66
|
assert_equal expecting, result
|
44
67
|
end
|
45
68
|
end
|
@@ -1,11 +1,5 @@
|
|
1
1
|
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
-
<testsuite name="Boogex" skipped="0" failures="0" errors="0" tests="
|
3
|
-
<testcase name="
|
4
|
-
</testcase>
|
5
|
-
<testcase name="test_0002_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.00011062994599342346">
|
6
|
-
</testcase>
|
7
|
-
<testcase name="test_0003_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.00020186102483421564">
|
8
|
-
</testcase>
|
9
|
-
<testcase name="test_0004_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.0003190739080309868">
|
2
|
+
<testsuite name="Boogex" skipped="0" failures="0" errors="0" tests="1" assertions="1" time="0.0007526259869337082">
|
3
|
+
<testcase name="test_0001_handles unclosed brackets" classname="Boogex" assertions="1" time="0.0007526259869337082">
|
10
4
|
</testcase>
|
11
5
|
</testsuite>
|