boogex 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -3
- data/Gemfile.lock +1 -1
- data/boogex.gemspec +7 -1
- data/lib/boogex/convertor.rb +128 -15
- data/lib/boogex/version.rb +1 -1
- data/test/convertor_test.rb +29 -6
- data/test/reports/TEST-Boogex.xml +2 -8
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ceb29f8b52c10eb03a3f38dd239e310c9a5a3a3f
|
4
|
+
data.tar.gz: 99547f836bbc224c92989dc0c24aecdf2ddc6344
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c6401a4209df8e76f32598be839ece5982ec89ee555d210e9414b1ff1dace6a59644144627109d0d03eb05167e23ca4540eda7d1bc0de9f3df39126232837025
|
7
|
+
data.tar.gz: 7ff03e8740454cb1df6c30fc2d9ec036b18cf244a439e58b9f5a96739428f9dde11ad8b2e40650d37a62a9918e75a9c6e45e162e018d25ca79777972c77017cf
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/boogex.gemspec
CHANGED
@@ -19,8 +19,14 @@ Gem::Specification.new do |s|
|
|
19
19
|
|Rakefile
|
20
20
|
|\.gitignore
|
21
21
|
|\.rubocop.yml
|
22
|
+
|.*\.gem
|
23
|
+
|.*\.gemspec
|
24
|
+
)$}x
|
25
|
+
end
|
26
|
+
s.test_files = `git ls-files`.split($RS).reject do |file|
|
27
|
+
file =~ %r{^(?:
|
28
|
+
|.*\.gem
|
22
29
|
)$}x
|
23
30
|
end
|
24
|
-
s.test_files = `git ls-files`.split($RS)
|
25
31
|
s.require_paths = ['lib']
|
26
32
|
end
|
data/lib/boogex/convertor.rb
CHANGED
@@ -1,8 +1,34 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
module Boogex
|
4
|
+
AND_REGEX = / AND /
|
5
|
+
OR_REGEX = / OR /
|
6
|
+
NOT_REGEX = / NOT /
|
4
7
|
def self.convert(text)
|
5
|
-
|
8
|
+
texts = text.split(NOT_REGEX)
|
9
|
+
fail "The regex '#{text}' split more than twice on 'NOT'" if texts.size > 2
|
10
|
+
inclu_text = texts[0]
|
11
|
+
exclu_text = texts[1]
|
12
|
+
regex_hash = {
|
13
|
+
inclusive_regex: run_through_convertors(inclu_text)
|
14
|
+
}
|
15
|
+
|
16
|
+
unless exclu_text.nil?
|
17
|
+
regex_hash[:exclusive_regex] = run_through_convertors(exclu_text)
|
18
|
+
regex_hash[:no_links] = true if exclu_text.include?('HTTP')
|
19
|
+
validate_regex_syntax!(regex_hash[:exclusive_regex], text)
|
20
|
+
end
|
21
|
+
|
22
|
+
validate_regex_syntax!(regex_hash[:inclusive_regex], text)
|
23
|
+
regex_hash
|
24
|
+
end
|
25
|
+
|
26
|
+
def self.validate_regex_syntax!(regex, text)
|
27
|
+
# Note: This also checks that the regex is valid and raises RegexpError if it isn't, including a description of what went wrong.
|
28
|
+
fail "#{regex} matched on nothing or empty space. Huh?" if !' '.match(regex).nil?
|
29
|
+
end
|
30
|
+
|
31
|
+
def self.run_through_convertors(text)
|
6
32
|
array = array_struct(text)
|
7
33
|
array = ors_to_pipes(array)
|
8
34
|
array = regex_formatting(array)
|
@@ -17,12 +43,12 @@ module Boogex
|
|
17
43
|
# "a OR (b AND (c OR d)) OR e" => ["a OR ", ["b AND ", ["c OR d"]], " OR e"]
|
18
44
|
def self.array_struct(text)
|
19
45
|
inside_brackets = "[^\(\)]*"
|
46
|
+
not_open_bracket = "[^\(]*"
|
20
47
|
|
21
48
|
#This regex looks for anything in brackets OR anything with brackets in brackets OR anything with brackets in brackets in brackets
|
22
|
-
regex =
|
49
|
+
regex = Regexp.new(get_bracket_regex)
|
23
50
|
|
24
51
|
cuts = text.scan(regex).to_a.flatten.reject(&:nil?)
|
25
|
-
|
26
52
|
# If nothing found then return original text
|
27
53
|
return text if cuts.empty?
|
28
54
|
|
@@ -34,18 +60,32 @@ module Boogex
|
|
34
60
|
result << str
|
35
61
|
else
|
36
62
|
splits = str.split(cut)
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
63
|
+
if splits.size == 2
|
64
|
+
result << splits.first
|
65
|
+
cut_without_brackets = cut[1..-2]
|
66
|
+
result << [cut_without_brackets]
|
67
|
+
result << splits.last
|
68
|
+
elsif splits.size == 1 && str.index(splits[0]) == 0
|
69
|
+
result << splits[0]
|
70
|
+
cut_without_brackets = cut[1..-2]
|
71
|
+
result << [cut_without_brackets]
|
72
|
+
elsif splits.size == 1 && str.index(splits[0]) > 0
|
73
|
+
cut_without_brackets = cut[1..-2]
|
74
|
+
result << [cut_without_brackets]
|
75
|
+
result << splits[0]
|
76
|
+
elsif splits.size == 0
|
77
|
+
cut_without_brackets = cut[1..-2]
|
78
|
+
result << [cut_without_brackets]
|
79
|
+
else
|
80
|
+
fail "This should never happen"
|
81
|
+
end
|
42
82
|
end
|
43
83
|
end
|
44
|
-
end.
|
84
|
+
end.compact
|
45
85
|
|
46
86
|
# This recursively converts any brackets in the text back into the array_struct function
|
47
87
|
# where the upper limit of recursion is 3 levels of bracketing. This is limited by the regex
|
48
|
-
# on line 9 but can
|
88
|
+
# on line 9 but can be extended.
|
49
89
|
# If the element of the array is a string then no recursion to apply.
|
50
90
|
# If the element of the array is an array then iterate THAT through the array_struct function
|
51
91
|
text_array.reject(&:empty?).each_with_object([]) do |str, result|
|
@@ -62,7 +102,9 @@ module Boogex
|
|
62
102
|
|
63
103
|
# This function converts the Lucene Boolean `OR` into regex `|` and removes any quotation marks
|
64
104
|
def self.ors_to_pipes(obj)
|
65
|
-
return obj.gsub(
|
105
|
+
return obj.gsub(OR_REGEX, '|').gsub('"', '').gsub(/\-(?=([^\[]*\[[^\]]*\])*[^\[\]]*$)/, '\-').gsub("'", '') if obj.is_a?(String)
|
106
|
+
|
107
|
+
raise "There are unclosed brackets in this boolean string" if has_unclosed_brackets?(obj)
|
66
108
|
|
67
109
|
# This recursively applies this function to ensure all levels of the array are converted
|
68
110
|
obj.collect do |text|
|
@@ -70,6 +112,12 @@ module Boogex
|
|
70
112
|
end
|
71
113
|
end
|
72
114
|
|
115
|
+
def self.has_unclosed_brackets?(obj)
|
116
|
+
obj.any? do |o|
|
117
|
+
o.count('(') != o.count(')')
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
73
121
|
# This function begins to transform the elements of the array structure to regex formatting
|
74
122
|
# including:
|
75
123
|
# - (a) Any elements that are not bookended by | are then wrapped in (?:) as this modularises
|
@@ -85,7 +133,7 @@ module Boogex
|
|
85
133
|
if obj.is_a?(String)
|
86
134
|
if contain_AND?(obj)
|
87
135
|
result = ['AND']
|
88
|
-
result = result + obj.split(
|
136
|
+
result = result + obj.split(AND_REGEX).reject(&:empty?).collect do |str|
|
89
137
|
regex_formatting(str)
|
90
138
|
end
|
91
139
|
return result
|
@@ -101,7 +149,7 @@ module Boogex
|
|
101
149
|
contain_AND?(str)
|
102
150
|
end
|
103
151
|
result = obj.each_with_object(['AND']) do |str, arr|
|
104
|
-
str.split(
|
152
|
+
str.split(AND_REGEX).reject(&:empty?).collect do |str|
|
105
153
|
arr << regex_formatting(str)
|
106
154
|
end
|
107
155
|
end
|
@@ -125,7 +173,7 @@ module Boogex
|
|
125
173
|
|
126
174
|
obj.each_with_object(result) do |text, result|
|
127
175
|
if contain_AND?(text)
|
128
|
-
text.split(
|
176
|
+
text.split(AND_REGEX).reject(&:empty?).each do |str|
|
129
177
|
result << regex_formatting(str)
|
130
178
|
end
|
131
179
|
else
|
@@ -168,7 +216,7 @@ module Boogex
|
|
168
216
|
end
|
169
217
|
|
170
218
|
def self.contain_AND?(obj)
|
171
|
-
obj.is_a?(String) && obj.
|
219
|
+
obj.is_a?(String) && obj.match(AND_REGEX)
|
172
220
|
end
|
173
221
|
|
174
222
|
def self.all_strings?(array)
|
@@ -189,4 +237,69 @@ module Boogex
|
|
189
237
|
def self.wrap_in_brackets(text)
|
190
238
|
'(?:' + text + ')'
|
191
239
|
end
|
240
|
+
|
241
|
+
def self.get_bracket_regex
|
242
|
+
@bracket_regex ||= generate_brack_regex
|
243
|
+
end
|
244
|
+
|
245
|
+
# This function generates the bracket regex. For simplicity, the regex for 'inside of a bracket' is represented by the
|
246
|
+
# string 'a', and the regex outside of a bracket is represented by the string 'b'. These are then substituted out at the end.
|
247
|
+
|
248
|
+
def self.generate_brack_regex
|
249
|
+
puts "Loading bracket regex..."
|
250
|
+
inside_brackets = "[^\(\)]*"
|
251
|
+
not_open_bracket = "[^\(]*"
|
252
|
+
|
253
|
+
get_bracket_inputs.collect do |input|
|
254
|
+
bracket_input_to_brackets(input).gsub('a', inside_brackets).gsub('b', not_open_bracket)
|
255
|
+
end.join('|')
|
256
|
+
end
|
257
|
+
|
258
|
+
# This function loads the valid permutations of the bracket regex where 0 represents an open bracket and 1 means closed bracket.
|
259
|
+
# All possible permutations of bracket ordering are generated and then only valid bracket orderings are selected.
|
260
|
+
def self.get_bracket_inputs
|
261
|
+
inputs = []
|
262
|
+
(0..2000).to_a.each do |v|
|
263
|
+
result = v.to_s(2).split('').collect(&:to_i)
|
264
|
+
inputs << result
|
265
|
+
inputs << result.reverse unless result == result.reverse
|
266
|
+
inputs.uniq!
|
267
|
+
end
|
268
|
+
|
269
|
+
inputs.select { |v| valid?(v) }
|
270
|
+
end
|
271
|
+
|
272
|
+
def self.valid?(input)
|
273
|
+
# The total count of brackets must be even.
|
274
|
+
return false unless input.size.even?
|
275
|
+
|
276
|
+
# Only have 0's or 1's as inputs
|
277
|
+
return false if input.any? { |v| ![0, 1].include?(v) }
|
278
|
+
|
279
|
+
# The number of open brackets must equal the number of closed brackets
|
280
|
+
return false unless input.inject(0) { |n, v| n + v } == input.size / 2
|
281
|
+
|
282
|
+
# Can't start with a close bracket or end with an open bracket
|
283
|
+
return false if input.first == 1 || input.last == 0
|
284
|
+
|
285
|
+
sum = 0
|
286
|
+
valid = true
|
287
|
+
input.each_with_index do |v, i|
|
288
|
+
sum += v
|
289
|
+
comparison = (i + 1) / 2
|
290
|
+
valid = false if sum > comparison
|
291
|
+
end
|
292
|
+
valid
|
293
|
+
end
|
294
|
+
|
295
|
+
# This function first converts the sequence of 0's and 1's to open and close brackets.
|
296
|
+
# It then puts in a 'b' string between any close brackets that are followed by an open bracket.
|
297
|
+
# Finally it compresses any consecutive 'a's into a single 'a' as they are idempotent (i.e. aaa == a).
|
298
|
+
|
299
|
+
def self.bracket_input_to_brackets(input)
|
300
|
+
brackets = ["\\(a", "a\\)"]
|
301
|
+
input.collect do |i|
|
302
|
+
brackets[i]
|
303
|
+
end.join('').gsub('\\)\\(', '\\)b\\(').gsub(/a+/, 'a')
|
304
|
+
end
|
192
305
|
end
|
data/lib/boogex/version.rb
CHANGED
data/test/convertor_test.rb
CHANGED
@@ -4,42 +4,65 @@ describe Boogex do
|
|
4
4
|
it 'turns OR into |' do
|
5
5
|
string = 'This OR That'
|
6
6
|
expecting = '(?:This|That)'
|
7
|
-
result = Boogex.convert(string)
|
7
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
8
8
|
assert_equal expecting, result
|
9
9
|
end
|
10
10
|
|
11
11
|
it 'turns boolean AND into AND array string' do
|
12
12
|
string = 'This AND That'
|
13
13
|
expecting = "AND(['(?:This)','(?:That)'])"
|
14
|
-
result = Boogex.convert(string)
|
14
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
15
15
|
assert_equal expecting, result
|
16
16
|
end
|
17
17
|
|
18
18
|
it 'understands bracketing' do
|
19
19
|
string = '(This OR That) AND My self'
|
20
20
|
expecting = "AND(['(?:This|That)','(?:My self)'])"
|
21
|
-
result = Boogex.convert(string)
|
21
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
22
22
|
assert_equal expecting, result
|
23
23
|
end
|
24
24
|
|
25
25
|
it 'correctly convert this Lucene boolean query string' do
|
26
26
|
string = '(((asd OR dd) AND that) AND this) OR What?'
|
27
27
|
expecting = "AND([AND(['(?:asd|dd)','(?:that)']),'(?:this)'])|What?"
|
28
|
-
result = Boogex.convert(string)
|
28
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
29
29
|
assert_equal expecting, result
|
30
30
|
end
|
31
31
|
|
32
32
|
it 'understands embedded AND' do
|
33
33
|
string = '((im AND researching) AND travelling)'
|
34
34
|
expecting = "AND([AND(['(?:im)','(?:researching)']),'(?:travelling)'])"
|
35
|
-
result = Boogex.convert(string)
|
35
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
36
36
|
assert_equal expecting, result
|
37
37
|
end
|
38
38
|
|
39
39
|
it 'understands double embedded AND' do
|
40
40
|
string = 'Peeps OR ((dude OR roos) AND (what OR Footy))'
|
41
41
|
expecting = "Peeps|AND(['(?:dude|roos)','(?:what|Footy)'])"
|
42
|
-
result = Boogex.convert(string)
|
42
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
43
|
+
assert_equal expecting, result
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'escapes hyphens' do
|
47
|
+
string = '("John Gordon-Smith" OR "hair") AND ("none" OR "all")'
|
48
|
+
expecting = "AND(['(?:John Gordon\\-Smith|hair)','(?:none|all)'])"
|
49
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
50
|
+
assert_equal expecting, result
|
51
|
+
end
|
52
|
+
|
53
|
+
it 'handles unclosed brackets' do
|
54
|
+
string = '("@AUSOlympicTeam" OR "Olympics") AND ("boxing" OR "@Shelley__watts"))'
|
55
|
+
|
56
|
+
assert_raises do
|
57
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'handles multiple brackets' do
|
62
|
+
string = '((@officialcsa OR "south africa" OR "south african") AND ("cricket" OR "proteas") AND ("national" OR "international") AND ("team"))'
|
63
|
+
expecting = "AND(['(?:@officialcsa|south africa|south african)','(?:cricket|proteas)','(?:national|international)','(?:team)'])"
|
64
|
+
|
65
|
+
result = Boogex.convert(string)[:inclusive_regex]
|
43
66
|
assert_equal expecting, result
|
44
67
|
end
|
45
68
|
end
|
@@ -1,11 +1,5 @@
|
|
1
1
|
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
-
<testsuite name="Boogex" skipped="0" failures="0" errors="0" tests="
|
3
|
-
<testcase name="
|
4
|
-
</testcase>
|
5
|
-
<testcase name="test_0002_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.00011062994599342346">
|
6
|
-
</testcase>
|
7
|
-
<testcase name="test_0003_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.00020186102483421564">
|
8
|
-
</testcase>
|
9
|
-
<testcase name="test_0004_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.0003190739080309868">
|
2
|
+
<testsuite name="Boogex" skipped="0" failures="0" errors="0" tests="1" assertions="1" time="0.0007526259869337082">
|
3
|
+
<testcase name="test_0001_handles unclosed brackets" classname="Boogex" assertions="1" time="0.0007526259869337082">
|
10
4
|
</testcase>
|
11
5
|
</testsuite>
|