boogex 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 83f81d847ac61e973b5d4db503c9f03632308c9e
4
- data.tar.gz: 09d2def1e45e8bd365b67f94f0157072c3ecfb31
3
+ metadata.gz: ceb29f8b52c10eb03a3f38dd239e310c9a5a3a3f
4
+ data.tar.gz: 99547f836bbc224c92989dc0c24aecdf2ddc6344
5
5
  SHA512:
6
- metadata.gz: 525ffc546863d4719e249a92b6939b44e3d716504dc53e776ddc11234090a94bb000638973233b234b7a597f97cbc4289e63a103d4e82c7bb7cf2782674800f8
7
- data.tar.gz: b6ef7c7cba2d1948080e5dba10610f9b5da545b590253a67b2c7d3cf9fea5cb8e25b3f2cd0b33679a6ebade54ce06a230e69c859fcddbcc36c70500a10da7266
6
+ metadata.gz: c6401a4209df8e76f32598be839ece5982ec89ee555d210e9414b1ff1dace6a59644144627109d0d03eb05167e23ca4540eda7d1bc0de9f3df39126232837025
7
+ data.tar.gz: 7ff03e8740454cb1df6c30fc2d9ec036b18cf244a439e58b9f5a96739428f9dde11ad8b2e40650d37a62a9918e75a9c6e45e162e018d25ca79777972c77017cf
data/.gitignore CHANGED
@@ -1,3 +1 @@
1
- **.DS_Store
2
- **.gem
3
-
1
+ **/.DS_Store
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- boogex (0.1.0)
4
+ boogex (0.1.1)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -19,8 +19,14 @@ Gem::Specification.new do |s|
19
19
  |Rakefile
20
20
  |\.gitignore
21
21
  |\.rubocop.yml
22
+ |.*\.gem
23
+ |.*\.gemspec
24
+ )$}x
25
+ end
26
+ s.test_files = `git ls-files`.split($RS).reject do |file|
27
+ file =~ %r{^(?:
28
+ |.*\.gem
22
29
  )$}x
23
30
  end
24
- s.test_files = `git ls-files`.split($RS)
25
31
  s.require_paths = ['lib']
26
32
  end
@@ -1,8 +1,34 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  module Boogex
4
+ AND_REGEX = / AND /
5
+ OR_REGEX = / OR /
6
+ NOT_REGEX = / NOT /
4
7
  def self.convert(text)
5
- puts "Converting \"#{text}\" into regex"
8
+ texts = text.split(NOT_REGEX)
9
+ fail "The regex '#{text}' split more than twice on 'NOT'" if texts.size > 2
10
+ inclu_text = texts[0]
11
+ exclu_text = texts[1]
12
+ regex_hash = {
13
+ inclusive_regex: run_through_convertors(inclu_text)
14
+ }
15
+
16
+ unless exclu_text.nil?
17
+ regex_hash[:exclusive_regex] = run_through_convertors(exclu_text)
18
+ regex_hash[:no_links] = true if exclu_text.include?('HTTP')
19
+ validate_regex_syntax!(regex_hash[:exclusive_regex], text)
20
+ end
21
+
22
+ validate_regex_syntax!(regex_hash[:inclusive_regex], text)
23
+ regex_hash
24
+ end
25
+
26
+ def self.validate_regex_syntax!(regex, text)
27
+ # Note: This also checks that the regex is valid; if it isn't, a RegexpError is raised with a description of what went wrong.
28
+ fail "#{regex} matched on nothing or empty space. Huh?" if !' '.match(regex).nil?
29
+ end
30
+
31
+ def self.run_through_convertors(text)
6
32
  array = array_struct(text)
7
33
  array = ors_to_pipes(array)
8
34
  array = regex_formatting(array)
@@ -17,12 +43,12 @@ module Boogex
17
43
  # "a OR (b AND (c OR d)) OR e" => ["a OR ", ["b AND ", ["c OR d"]], " OR e"]
18
44
  def self.array_struct(text)
19
45
  inside_brackets = "[^\(\)]*"
46
+ not_open_bracket = "[^\(]*"
20
47
 
21
48
  #This regex looks for anything in brackets OR anything with brackets in brackets OR anything with brackets in brackets in brackets
22
- regex = /(\(#{inside_brackets}\))|(\(#{inside_brackets}\(#{inside_brackets}\)#{inside_brackets}\))|(\(#{inside_brackets}\(#{inside_brackets}\)[^\(]*\(#{inside_brackets}\)#{inside_brackets}\))|(\(#{inside_brackets}\(#{inside_brackets}\(#{inside_brackets}\)#{inside_brackets}\)#{inside_brackets}\))/
49
+ regex = Regexp.new(get_bracket_regex)
23
50
 
24
51
  cuts = text.scan(regex).to_a.flatten.reject(&:nil?)
25
-
26
52
  # If nothing found then return original text
27
53
  return text if cuts.empty?
28
54
 
@@ -34,18 +60,32 @@ module Boogex
34
60
  result << str
35
61
  else
36
62
  splits = str.split(cut)
37
-
38
- result << splits.first
39
- cut_without_brackets = cut[1..-2]
40
- result << [cut_without_brackets]
41
- result << splits.last
63
+ if splits.size == 2
64
+ result << splits.first
65
+ cut_without_brackets = cut[1..-2]
66
+ result << [cut_without_brackets]
67
+ result << splits.last
68
+ elsif splits.size == 1 && str.index(splits[0]) == 0
69
+ result << splits[0]
70
+ cut_without_brackets = cut[1..-2]
71
+ result << [cut_without_brackets]
72
+ elsif splits.size == 1 && str.index(splits[0]) > 0
73
+ cut_without_brackets = cut[1..-2]
74
+ result << [cut_without_brackets]
75
+ result << splits[0]
76
+ elsif splits.size == 0
77
+ cut_without_brackets = cut[1..-2]
78
+ result << [cut_without_brackets]
79
+ else
80
+ fail "This should never happen"
81
+ end
42
82
  end
43
83
  end
44
- end.uniq.compact
84
+ end.compact
45
85
 
46
86
  # This recursively converts any brackets in the text back into the array_struct function
47
87
  # where the upper limit of recursion is 3 levels of bracketing. This is limited by the regex
48
- # on line 9 but can easily be extended.
88
+ # on line 9 but can be extended.
49
89
  # If the element of the array is a string then no recursion to apply.
50
90
  # If the element of the array is an array then iterate THAT through the array_struct function
51
91
  text_array.reject(&:empty?).each_with_object([]) do |str, result|
@@ -62,7 +102,9 @@ module Boogex
62
102
 
63
103
  # This function converts the Lucene Boolean `OR` into regex `|` and removes any quotation marks
64
104
  def self.ors_to_pipes(obj)
65
- return obj.gsub(' OR ', '|').gsub('"', '').gsub("'", '') if obj.is_a?(String)
105
+ return obj.gsub(OR_REGEX, '|').gsub('"', '').gsub(/\-(?=([^\[]*\[[^\]]*\])*[^\[\]]*$)/, '\-').gsub("'", '') if obj.is_a?(String)
106
+
107
+ raise "There are unclosed brackets in this boolean string" if has_unclosed_brackets?(obj)
66
108
 
67
109
  # This recursively applies this function to ensure all levels of the array are converted
68
110
  obj.collect do |text|
@@ -70,6 +112,12 @@ module Boogex
70
112
  end
71
113
  end
72
114
 
115
+ def self.has_unclosed_brackets?(obj)
116
+ obj.any? do |o|
117
+ o.count('(') != o.count(')')
118
+ end
119
+ end
120
+
73
121
  # This function begins to transform the elements of the array structure to regex formatting
74
122
  # including:
75
123
  # - (a) Any elements that are not bookended by | are then wrapped in (?:) as this modularises
@@ -85,7 +133,7 @@ module Boogex
85
133
  if obj.is_a?(String)
86
134
  if contain_AND?(obj)
87
135
  result = ['AND']
88
- result = result + obj.split(' AND ').reject(&:empty?).collect do |str|
136
+ result = result + obj.split(AND_REGEX).reject(&:empty?).collect do |str|
89
137
  regex_formatting(str)
90
138
  end
91
139
  return result
@@ -101,7 +149,7 @@ module Boogex
101
149
  contain_AND?(str)
102
150
  end
103
151
  result = obj.each_with_object(['AND']) do |str, arr|
104
- str.split(' AND ').reject(&:empty?).collect do |str|
152
+ str.split(AND_REGEX).reject(&:empty?).collect do |str|
105
153
  arr << regex_formatting(str)
106
154
  end
107
155
  end
@@ -125,7 +173,7 @@ module Boogex
125
173
 
126
174
  obj.each_with_object(result) do |text, result|
127
175
  if contain_AND?(text)
128
- text.split(' AND ').reject(&:empty?).each do |str|
176
+ text.split(AND_REGEX).reject(&:empty?).each do |str|
129
177
  result << regex_formatting(str)
130
178
  end
131
179
  else
@@ -168,7 +216,7 @@ module Boogex
168
216
  end
169
217
 
170
218
  def self.contain_AND?(obj)
171
- obj.is_a?(String) && obj.include?(' AND ')
219
+ obj.is_a?(String) && obj.match(AND_REGEX)
172
220
  end
173
221
 
174
222
  def self.all_strings?(array)
@@ -189,4 +237,69 @@ module Boogex
189
237
  def self.wrap_in_brackets(text)
190
238
  '(?:' + text + ')'
191
239
  end
240
+
241
+ def self.get_bracket_regex
242
+ @bracket_regex ||= generate_brack_regex
243
+ end
244
+
245
+ # This function generates the bracket regex. For simplicity, the regex for 'inside of a bracket' is represented by the
246
+ # string 'a', and the regex outside of a bracket is represented by the string 'b'. These are then substituted out at the end.
247
+
248
+ def self.generate_brack_regex
249
+ puts "Loading bracket regex..."
250
+ inside_brackets = "[^\(\)]*"
251
+ not_open_bracket = "[^\(]*"
252
+
253
+ get_bracket_inputs.collect do |input|
254
+ bracket_input_to_brackets(input).gsub('a', inside_brackets).gsub('b', not_open_bracket)
255
+ end.join('|')
256
+ end
257
+
258
+ # This function loads the valid permutations of the bracket regex where 0 represents an open bracket and 1 means closed bracket.
259
+ # All possible permutations of bracket ordering are generated and then only valid bracket orderings are selected.
260
+ def self.get_bracket_inputs
261
+ inputs = []
262
+ (0..2000).to_a.each do |v|
263
+ result = v.to_s(2).split('').collect(&:to_i)
264
+ inputs << result
265
+ inputs << result.reverse unless result == result.reverse
266
+ inputs.uniq!
267
+ end
268
+
269
+ inputs.select { |v| valid?(v) }
270
+ end
271
+
272
+ def self.valid?(input)
273
+ # The total count of brackets must be even.
274
+ return false unless input.size.even?
275
+
276
+ # Only have 0's or 1's as inputs
277
+ return false if input.any? { |v| ![0, 1].include?(v) }
278
+
279
+ # The number of open brackets must equal the number of closed brackets
280
+ return false unless input.inject(0) { |n, v| n + v } == input.size / 2
281
+
282
+ # Can't start with a close bracket or end with an open bracket
283
+ return false if input.first == 1 || input.last == 0
284
+
285
+ sum = 0
286
+ valid = true
287
+ input.each_with_index do |v, i|
288
+ sum += v
289
+ comparison = (i + 1) / 2
290
+ valid = false if sum > comparison
291
+ end
292
+ valid
293
+ end
294
+
295
+ # This function first converts the sequence of 0's and 1's to open and close brackets.
296
+ # It then puts in a 'b' string between any close brackets that are followed by an open bracket.
297
+ # Finally it compresses any consecutive 'a's into a single 'a' as they are idempotent (i.e. aaa == a).
298
+
299
+ def self.bracket_input_to_brackets(input)
300
+ brackets = ["\\(a", "a\\)"]
301
+ input.collect do |i|
302
+ brackets[i]
303
+ end.join('').gsub('\\)\\(', '\\)b\\(').gsub(/a+/, 'a')
304
+ end
192
305
  end
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module Boogex
3
- VERSION = '0.1.1'
3
+ VERSION = '0.1.2'
4
4
  end
@@ -4,42 +4,65 @@ describe Boogex do
4
4
  it 'turns OR into |' do
5
5
  string = 'This OR That'
6
6
  expecting = '(?:This|That)'
7
- result = Boogex.convert(string)
7
+ result = Boogex.convert(string)[:inclusive_regex]
8
8
  assert_equal expecting, result
9
9
  end
10
10
 
11
11
  it 'turns boolean AND into AND array string' do
12
12
  string = 'This AND That'
13
13
  expecting = "AND(['(?:This)','(?:That)'])"
14
- result = Boogex.convert(string)
14
+ result = Boogex.convert(string)[:inclusive_regex]
15
15
  assert_equal expecting, result
16
16
  end
17
17
 
18
18
  it 'understands bracketing' do
19
19
  string = '(This OR That) AND My self'
20
20
  expecting = "AND(['(?:This|That)','(?:My self)'])"
21
- result = Boogex.convert(string)
21
+ result = Boogex.convert(string)[:inclusive_regex]
22
22
  assert_equal expecting, result
23
23
  end
24
24
 
25
25
  it 'correctly convert this Lucene boolean query string' do
26
26
  string = '(((asd OR dd) AND that) AND this) OR What?'
27
27
  expecting = "AND([AND(['(?:asd|dd)','(?:that)']),'(?:this)'])|What?"
28
- result = Boogex.convert(string)
28
+ result = Boogex.convert(string)[:inclusive_regex]
29
29
  assert_equal expecting, result
30
30
  end
31
31
 
32
32
  it 'understands embedded AND' do
33
33
  string = '((im AND researching) AND travelling)'
34
34
  expecting = "AND([AND(['(?:im)','(?:researching)']),'(?:travelling)'])"
35
- result = Boogex.convert(string)
35
+ result = Boogex.convert(string)[:inclusive_regex]
36
36
  assert_equal expecting, result
37
37
  end
38
38
 
39
39
  it 'understands double embedded AND' do
40
40
  string = 'Peeps OR ((dude OR roos) AND (what OR Footy))'
41
41
  expecting = "Peeps|AND(['(?:dude|roos)','(?:what|Footy)'])"
42
- result = Boogex.convert(string)
42
+ result = Boogex.convert(string)[:inclusive_regex]
43
+ assert_equal expecting, result
44
+ end
45
+
46
+ it 'escapes hyphens' do
47
+ string = '("John Gordon-Smith" OR "hair") AND ("none" OR "all")'
48
+ expecting = "AND(['(?:John Gordon\\-Smith|hair)','(?:none|all)'])"
49
+ result = Boogex.convert(string)[:inclusive_regex]
50
+ assert_equal expecting, result
51
+ end
52
+
53
+ it 'handles unclosed brackets' do
54
+ string = '("@AUSOlympicTeam" OR "Olympics") AND ("boxing" OR "@Shelley__watts"))'
55
+
56
+ assert_raises do
57
+ result = Boogex.convert(string)[:inclusive_regex]
58
+ end
59
+ end
60
+
61
+ it 'handles multiple brackets' do
62
+ string = '((@officialcsa OR "south africa" OR "south african") AND ("cricket" OR "proteas") AND ("national" OR "international") AND ("team"))'
63
+ expecting = "AND(['(?:@officialcsa|south africa|south african)','(?:cricket|proteas)','(?:national|international)','(?:team)'])"
64
+
65
+ result = Boogex.convert(string)[:inclusive_regex]
43
66
  assert_equal expecting, result
44
67
  end
45
68
  end
@@ -1,11 +1,5 @@
1
1
  <?xml version="1.0" encoding="UTF-8"?>
2
- <testsuite name="Boogex" skipped="0" failures="0" errors="0" tests="4" assertions="4" time="0.0007573158945888281">
3
- <testcase name="test_0001_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.0001257510157302022">
4
- </testcase>
5
- <testcase name="test_0002_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.00011062994599342346">
6
- </testcase>
7
- <testcase name="test_0003_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.00020186102483421564">
8
- </testcase>
9
- <testcase name="test_0004_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.0003190739080309868">
2
+ <testsuite name="Boogex" skipped="0" failures="0" errors="0" tests="1" assertions="1" time="0.0007526259869337082">
3
+ <testcase name="test_0001_handles unclosed brackets" classname="Boogex" assertions="1" time="0.0007526259869337082">
10
4
  </testcase>
11
5
  </testsuite>
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boogex
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sam Crouch