boogex 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 83f81d847ac61e973b5d4db503c9f03632308c9e
4
- data.tar.gz: 09d2def1e45e8bd365b67f94f0157072c3ecfb31
3
+ metadata.gz: ceb29f8b52c10eb03a3f38dd239e310c9a5a3a3f
4
+ data.tar.gz: 99547f836bbc224c92989dc0c24aecdf2ddc6344
5
5
  SHA512:
6
- metadata.gz: 525ffc546863d4719e249a92b6939b44e3d716504dc53e776ddc11234090a94bb000638973233b234b7a597f97cbc4289e63a103d4e82c7bb7cf2782674800f8
7
- data.tar.gz: b6ef7c7cba2d1948080e5dba10610f9b5da545b590253a67b2c7d3cf9fea5cb8e25b3f2cd0b33679a6ebade54ce06a230e69c859fcddbcc36c70500a10da7266
6
+ metadata.gz: c6401a4209df8e76f32598be839ece5982ec89ee555d210e9414b1ff1dace6a59644144627109d0d03eb05167e23ca4540eda7d1bc0de9f3df39126232837025
7
+ data.tar.gz: 7ff03e8740454cb1df6c30fc2d9ec036b18cf244a439e58b9f5a96739428f9dde11ad8b2e40650d37a62a9918e75a9c6e45e162e018d25ca79777972c77017cf
data/.gitignore CHANGED
@@ -1,3 +1 @@
1
- **.DS_Store
2
- **.gem
3
-
1
+ **/.DS_Store
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- boogex (0.1.0)
4
+ boogex (0.1.1)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -19,8 +19,14 @@ Gem::Specification.new do |s|
19
19
  |Rakefile
20
20
  |\.gitignore
21
21
  |\.rubocop.yml
22
+ |.*\.gem
23
+ |.*\.gemspec
24
+ )$}x
25
+ end
26
+ s.test_files = `git ls-files`.split($RS).reject do |file|
27
+ file =~ %r{^(?:
28
+ |.*\.gem
22
29
  )$}x
23
30
  end
24
- s.test_files = `git ls-files`.split($RS)
25
31
  s.require_paths = ['lib']
26
32
  end
@@ -1,8 +1,34 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  module Boogex
4
+ AND_REGEX = / AND /
5
+ OR_REGEX = / OR /
6
+ NOT_REGEX = / NOT /
4
7
  def self.convert(text)
5
- puts "Converting \"#{text}\" into regex"
8
+ texts = text.split(NOT_REGEX)
9
+ fail "The regex '#{text}' split more than twice on 'NOT'" if texts.size > 2
10
+ inclu_text = texts[0]
11
+ exclu_text = texts[1]
12
+ regex_hash = {
13
+ inclusive_regex: run_through_convertors(inclu_text)
14
+ }
15
+
16
+ unless exclu_text.nil?
17
+ regex_hash[:exclusive_regex] = run_through_convertors(exclu_text)
18
+ regex_hash[:no_links] = true if exclu_text.include?('HTTP')
19
+ validate_regex_syntax!(regex_hash[:exclusive_regex], text)
20
+ end
21
+
22
+ validate_regex_syntax!(regex_hash[:inclusive_regex], text)
23
+ regex_hash
24
+ end
25
+
26
+ def self.validate_regex_syntax!(regex, text)
27
+ # Note: This also checks that the regex is valid; if it isn't, a RegexpError is raised that includes a description of what went wrong.
28
+ fail "#{regex} matched on nothing or empty space. Huh?" if !' '.match(regex).nil?
29
+ end
30
+
31
+ def self.run_through_convertors(text)
6
32
  array = array_struct(text)
7
33
  array = ors_to_pipes(array)
8
34
  array = regex_formatting(array)
@@ -17,12 +43,12 @@ module Boogex
17
43
  # "a OR (b AND (c OR d)) OR e" => ["a OR ", ["b AND ", ["c OR d"]], " OR e"]
18
44
  def self.array_struct(text)
19
45
  inside_brackets = "[^\(\)]*"
46
+ not_open_bracket = "[^\(]*"
20
47
 
21
48
  #This regex looks for anything in brackets OR anything with brackets in brackets OR anything with brackets in brackets in brackets
22
- regex = /(\(#{inside_brackets}\))|(\(#{inside_brackets}\(#{inside_brackets}\)#{inside_brackets}\))|(\(#{inside_brackets}\(#{inside_brackets}\)[^\(]*\(#{inside_brackets}\)#{inside_brackets}\))|(\(#{inside_brackets}\(#{inside_brackets}\(#{inside_brackets}\)#{inside_brackets}\)#{inside_brackets}\))/
49
+ regex = Regexp.new(get_bracket_regex)
23
50
 
24
51
  cuts = text.scan(regex).to_a.flatten.reject(&:nil?)
25
-
26
52
  # If nothing found then return original text
27
53
  return text if cuts.empty?
28
54
 
@@ -34,18 +60,32 @@ module Boogex
34
60
  result << str
35
61
  else
36
62
  splits = str.split(cut)
37
-
38
- result << splits.first
39
- cut_without_brackets = cut[1..-2]
40
- result << [cut_without_brackets]
41
- result << splits.last
63
+ if splits.size == 2
64
+ result << splits.first
65
+ cut_without_brackets = cut[1..-2]
66
+ result << [cut_without_brackets]
67
+ result << splits.last
68
+ elsif splits.size == 1 && str.index(splits[0]) == 0
69
+ result << splits[0]
70
+ cut_without_brackets = cut[1..-2]
71
+ result << [cut_without_brackets]
72
+ elsif splits.size == 1 && str.index(splits[0]) > 0
73
+ cut_without_brackets = cut[1..-2]
74
+ result << [cut_without_brackets]
75
+ result << splits[0]
76
+ elsif splits.size == 0
77
+ cut_without_brackets = cut[1..-2]
78
+ result << [cut_without_brackets]
79
+ else
80
+ fail "This should never happen"
81
+ end
42
82
  end
43
83
  end
44
- end.uniq.compact
84
+ end.compact
45
85
 
46
86
  # This recursively converts any brackets in the text back into the array_struct function
47
87
  # where the upper limit of recursion is 3 levels of bracketing. This is limited by the regex
48
- # on line 9 but can easily be extended.
88
+ # on line 9 but can be extended.
49
89
  # If the element of the array is a string then no recursion to apply.
50
90
  # If the element of the array is an array then iterate THAT through the array_struct function
51
91
  text_array.reject(&:empty?).each_with_object([]) do |str, result|
@@ -62,7 +102,9 @@ module Boogex
62
102
 
63
103
  # This function converts the Lucene Boolean `OR` into regex `|` and removes any quotation marks
64
104
  def self.ors_to_pipes(obj)
65
- return obj.gsub(' OR ', '|').gsub('"', '').gsub("'", '') if obj.is_a?(String)
105
+ return obj.gsub(OR_REGEX, '|').gsub('"', '').gsub(/\-(?=([^\[]*\[[^\]]*\])*[^\[\]]*$)/, '\-').gsub("'", '') if obj.is_a?(String)
106
+
107
+ raise "There are unclosed brackets in this boolean string" if has_unclosed_brackets?(obj)
66
108
 
67
109
  # This recursively applies this function to ensure all levels of the array are converted
68
110
  obj.collect do |text|
@@ -70,6 +112,12 @@ module Boogex
70
112
  end
71
113
  end
72
114
 
115
+ def self.has_unclosed_brackets?(obj)
116
+ obj.any? do |o|
117
+ o.count('(') != o.count(')')
118
+ end
119
+ end
120
+
73
121
  # This function begins to transform the elements of the array structure to regex formatting
74
122
  # including:
75
123
  # - (a) Any elements that are not bookended by | are then wrapped in (?:) as this modularises
@@ -85,7 +133,7 @@ module Boogex
85
133
  if obj.is_a?(String)
86
134
  if contain_AND?(obj)
87
135
  result = ['AND']
88
- result = result + obj.split(' AND ').reject(&:empty?).collect do |str|
136
+ result = result + obj.split(AND_REGEX).reject(&:empty?).collect do |str|
89
137
  regex_formatting(str)
90
138
  end
91
139
  return result
@@ -101,7 +149,7 @@ module Boogex
101
149
  contain_AND?(str)
102
150
  end
103
151
  result = obj.each_with_object(['AND']) do |str, arr|
104
- str.split(' AND ').reject(&:empty?).collect do |str|
152
+ str.split(AND_REGEX).reject(&:empty?).collect do |str|
105
153
  arr << regex_formatting(str)
106
154
  end
107
155
  end
@@ -125,7 +173,7 @@ module Boogex
125
173
 
126
174
  obj.each_with_object(result) do |text, result|
127
175
  if contain_AND?(text)
128
- text.split(' AND ').reject(&:empty?).each do |str|
176
+ text.split(AND_REGEX).reject(&:empty?).each do |str|
129
177
  result << regex_formatting(str)
130
178
  end
131
179
  else
@@ -168,7 +216,7 @@ module Boogex
168
216
  end
169
217
 
170
218
  def self.contain_AND?(obj)
171
- obj.is_a?(String) && obj.include?(' AND ')
219
+ obj.is_a?(String) && obj.match(AND_REGEX)
172
220
  end
173
221
 
174
222
  def self.all_strings?(array)
@@ -189,4 +237,69 @@ module Boogex
189
237
  def self.wrap_in_brackets(text)
190
238
  '(?:' + text + ')'
191
239
  end
240
+
241
+ def self.get_bracket_regex
242
+ @bracket_regex ||= generate_brack_regex
243
+ end
244
+
245
+ # This function generates the bracket regex. For simplicity, the regex for 'inside of a bracket' is represented by the
246
+ # string 'a', and the regex outside of a bracket is represented by the string 'b'. These are then substituted out at the end.
247
+
248
+ def self.generate_brack_regex
249
+ puts "Loading bracket regex..."
250
+ inside_brackets = "[^\(\)]*"
251
+ not_open_bracket = "[^\(]*"
252
+
253
+ get_bracket_inputs.collect do |input|
254
+ bracket_input_to_brackets(input).gsub('a', inside_brackets).gsub('b', not_open_bracket)
255
+ end.join('|')
256
+ end
257
+
258
+ # This function loads the valid permutations of the bracket regex where 0 represents an open bracket and 1 means closed bracket.
259
+ # All possible permutations of bracket ordering are generated and then only valid bracket orderings are selected.
260
+ def self.get_bracket_inputs
261
+ inputs = []
262
+ (0..2000).to_a.each do |v|
263
+ result = v.to_s(2).split('').collect(&:to_i)
264
+ inputs << result
265
+ inputs << result.reverse unless result == result.reverse
266
+ inputs.uniq!
267
+ end
268
+
269
+ inputs.select { |v| valid?(v) }
270
+ end
271
+
272
+ def self.valid?(input)
273
+ # The total count of brackets must be even.
274
+ return false unless input.size.even?
275
+
276
+ # Only have 0's or 1's as inputs
277
+ return false if input.any? { |v| ![0, 1].include?(v) }
278
+
279
+ # The number of open brackets must equal the number of closed brackets
280
+ return false unless input.inject(0) { |n, v| n + v } == input.size / 2
281
+
282
+ # Can't start with a close bracket or end with an open bracket
283
+ return false if input.first == 1 || input.last == 0
284
+
285
+ sum = 0
286
+ valid = true
287
+ input.each_with_index do |v, i|
288
+ sum += v
289
+ comparison = (i + 1) / 2
290
+ valid = false if sum > comparison
291
+ end
292
+ valid
293
+ end
294
+
295
+ # This function first converts the sequence of 0's and 1's to open and close brackets.
296
+ # It then puts in a 'b' string between any close brackets that are followed by an open bracket.
297
+ # Finally it compresses any consecutive 'a's into a single 'a' as they are idempotent (i.e. aaa == a).
298
+
299
+ def self.bracket_input_to_brackets(input)
300
+ brackets = ["\\(a", "a\\)"]
301
+ input.collect do |i|
302
+ brackets[i]
303
+ end.join('').gsub('\\)\\(', '\\)b\\(').gsub(/a+/, 'a')
304
+ end
192
305
  end
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module Boogex
3
- VERSION = '0.1.1'
3
+ VERSION = '0.1.2'
4
4
  end
@@ -4,42 +4,65 @@ describe Boogex do
4
4
  it 'turns OR into |' do
5
5
  string = 'This OR That'
6
6
  expecting = '(?:This|That)'
7
- result = Boogex.convert(string)
7
+ result = Boogex.convert(string)[:inclusive_regex]
8
8
  assert_equal expecting, result
9
9
  end
10
10
 
11
11
  it 'turns boolean AND into AND array string' do
12
12
  string = 'This AND That'
13
13
  expecting = "AND(['(?:This)','(?:That)'])"
14
- result = Boogex.convert(string)
14
+ result = Boogex.convert(string)[:inclusive_regex]
15
15
  assert_equal expecting, result
16
16
  end
17
17
 
18
18
  it 'understands bracketing' do
19
19
  string = '(This OR That) AND My self'
20
20
  expecting = "AND(['(?:This|That)','(?:My self)'])"
21
- result = Boogex.convert(string)
21
+ result = Boogex.convert(string)[:inclusive_regex]
22
22
  assert_equal expecting, result
23
23
  end
24
24
 
25
25
  it 'correctly convert this Lucene boolean query string' do
26
26
  string = '(((asd OR dd) AND that) AND this) OR What?'
27
27
  expecting = "AND([AND(['(?:asd|dd)','(?:that)']),'(?:this)'])|What?"
28
- result = Boogex.convert(string)
28
+ result = Boogex.convert(string)[:inclusive_regex]
29
29
  assert_equal expecting, result
30
30
  end
31
31
 
32
32
  it 'understands embedded AND' do
33
33
  string = '((im AND researching) AND travelling)'
34
34
  expecting = "AND([AND(['(?:im)','(?:researching)']),'(?:travelling)'])"
35
- result = Boogex.convert(string)
35
+ result = Boogex.convert(string)[:inclusive_regex]
36
36
  assert_equal expecting, result
37
37
  end
38
38
 
39
39
  it 'understands double embedded AND' do
40
40
  string = 'Peeps OR ((dude OR roos) AND (what OR Footy))'
41
41
  expecting = "Peeps|AND(['(?:dude|roos)','(?:what|Footy)'])"
42
- result = Boogex.convert(string)
42
+ result = Boogex.convert(string)[:inclusive_regex]
43
+ assert_equal expecting, result
44
+ end
45
+
46
+ it 'escapes hyphens' do
47
+ string = '("John Gordon-Smith" OR "hair") AND ("none" OR "all")'
48
+ expecting = "AND(['(?:John Gordon\\-Smith|hair)','(?:none|all)'])"
49
+ result = Boogex.convert(string)[:inclusive_regex]
50
+ assert_equal expecting, result
51
+ end
52
+
53
+ it 'handles unclosed brackets' do
54
+ string = '("@AUSOlympicTeam" OR "Olympics") AND ("boxing" OR "@Shelley__watts"))'
55
+
56
+ assert_raises do
57
+ result = Boogex.convert(string)[:inclusive_regex]
58
+ end
59
+ end
60
+
61
+ it 'handles multiple brackets' do
62
+ string = '((@officialcsa OR "south africa" OR "south african") AND ("cricket" OR "proteas") AND ("national" OR "international") AND ("team"))'
63
+ expecting = "AND(['(?:@officialcsa|south africa|south african)','(?:cricket|proteas)','(?:national|international)','(?:team)'])"
64
+
65
+ result = Boogex.convert(string)[:inclusive_regex]
43
66
  assert_equal expecting, result
44
67
  end
45
68
  end
@@ -1,11 +1,5 @@
1
1
  <?xml version="1.0" encoding="UTF-8"?>
2
- <testsuite name="Boogex" skipped="0" failures="0" errors="0" tests="4" assertions="4" time="0.0007573158945888281">
3
- <testcase name="test_0001_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.0001257510157302022">
4
- </testcase>
5
- <testcase name="test_0002_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.00011062994599342346">
6
- </testcase>
7
- <testcase name="test_0003_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.00020186102483421564">
8
- </testcase>
9
- <testcase name="test_0004_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.0003190739080309868">
2
+ <testsuite name="Boogex" skipped="0" failures="0" errors="0" tests="1" assertions="1" time="0.0007526259869337082">
3
+ <testcase name="test_0001_handles unclosed brackets" classname="Boogex" assertions="1" time="0.0007526259869337082">
10
4
  </testcase>
11
5
  </testsuite>
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boogex
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sam Crouch