RubyGems - boogex - Versions diffs - 0.1.1 → 0.1.2 - Mend

boogex 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 83f81d847ac61e973b5d4db503c9f03632308c9e
-  data.tar.gz: 09d2def1e45e8bd365b67f94f0157072c3ecfb31
+  metadata.gz: ceb29f8b52c10eb03a3f38dd239e310c9a5a3a3f
+  data.tar.gz: 99547f836bbc224c92989dc0c24aecdf2ddc6344
 SHA512:
-  metadata.gz: 525ffc546863d4719e249a92b6939b44e3d716504dc53e776ddc11234090a94bb000638973233b234b7a597f97cbc4289e63a103d4e82c7bb7cf2782674800f8
-  data.tar.gz: b6ef7c7cba2d1948080e5dba10610f9b5da545b590253a67b2c7d3cf9fea5cb8e25b3f2cd0b33679a6ebade54ce06a230e69c859fcddbcc36c70500a10da7266
+  metadata.gz: c6401a4209df8e76f32598be839ece5982ec89ee555d210e9414b1ff1dace6a59644144627109d0d03eb05167e23ca4540eda7d1bc0de9f3df39126232837025
+  data.tar.gz: 7ff03e8740454cb1df6c30fc2d9ec036b18cf244a439e58b9f5a96739428f9dde11ad8b2e40650d37a62a9918e75a9c6e45e162e018d25ca79777972c77017cf

data/.gitignore CHANGED

@@ -1,3 +1 @@
-**.DS_Store
-**.gem
+**/.DS_Store

data/Gemfile.lock CHANGED

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    boogex (0.1.0)
+    boogex (0.1.1)
 GEM
   remote: https://rubygems.org/

data/boogex.gemspec CHANGED

@@ -19,8 +19,14 @@ Gem::Specification.new do |s|
     |Rakefile
     |\.gitignore
     |\.rubocop.yml
+    |.*\.gem
+    |.*\.gemspec
+    )$}x
+  end
+  s.test_files    = `git ls-files`.split($RS).reject do |file|
+    file =~ %r{^(?:
+    |.*\.gem
     )$}x
   end
-  s.test_files    = `git ls-files`.split($RS)
   s.require_paths = ['lib']
 end

data/lib/boogex/convertor.rb CHANGED

@@ -1,8 +1,34 @@
 # encoding: utf-8
 module Boogex
+  AND_REGEX = / AND /
+  OR_REGEX = / OR /
+  NOT_REGEX = / NOT /
   def self.convert(text)
-    puts "Converting \"#{text}\" into regex"
+    texts = text.split(NOT_REGEX)
+    fail "The regex '#{text}' split more than twice on 'NOT'" if texts.size > 2
+    inclu_text = texts[0]
+    exclu_text = texts[1]
+    regex_hash = {
+      inclusive_regex: run_through_convertors(inclu_text)
+    }
+    unless exclu_text.nil?
+      regex_hash[:exclusive_regex] = run_through_convertors(exclu_text)
+      regex_hash[:no_links] = true if exclu_text.include?('HTTP')
+      validate_regex_syntax!(regex_hash[:exclusive_regex], text)
+    end
+    validate_regex_syntax!(regex_hash[:inclusive_regex], text)
+    regex_hash
+  end
+  def self.validate_regex_syntax!(regex, text)
+    # Note: This also checks that the regex is valid and raises a RegexpError, including a description of what went wrong, if it isn't.
+    fail "#{regex} matched on nothing or empty space. Huh?" if !' '.match(regex).nil?
+  end
+  def self.run_through_convertors(text)
     array = array_struct(text)
     array = ors_to_pipes(array)
     array = regex_formatting(array)
@@ -17,12 +43,12 @@ module Boogex
   # "a OR (b AND (c OR d)) OR e" => ["a OR ", ["b AND ", ["c OR d"]], " OR e"]
   def self.array_struct(text)
     inside_brackets = "[^\(\)]*"
+    not_open_bracket = "[^\(]*"
     #This regex looks for anything in brackets OR anything with brackets in brackets OR anything with brackets in brackets in brackets
-    regex = /(\(#{inside_brackets}\))|(\(#{inside_brackets}\(#{inside_brackets}\)#{inside_brackets}\))|(\(#{inside_brackets}\(#{inside_brackets}\)[^\(]*\(#{inside_brackets}\)#{inside_brackets}\))|(\(#{inside_brackets}\(#{inside_brackets}\(#{inside_brackets}\)#{inside_brackets}\)#{inside_brackets}\))/
+    regex = Regexp.new(get_bracket_regex)
     cuts = text.scan(regex).to_a.flatten.reject(&:nil?)
     # If nothing found then return original text
     return text if cuts.empty?
@@ -34,18 +60,32 @@ module Boogex
           result << str
         else
           splits = str.split(cut)
-          result << splits.first
-          cut_without_brackets = cut[1..-2]
-          result << [cut_without_brackets]
-          result << splits.last
+          if splits.size == 2
+            result << splits.first
+            cut_without_brackets = cut[1..-2]
+            result << [cut_without_brackets]
+            result << splits.last
+          elsif splits.size == 1 && str.index(splits[0]) == 0
+            result << splits[0]
+            cut_without_brackets = cut[1..-2]
+            result << [cut_without_brackets]
+          elsif splits.size == 1 && str.index(splits[0]) > 0
+            cut_without_brackets = cut[1..-2]
+            result << [cut_without_brackets]
+            result << splits[0]
+          elsif splits.size == 0
+            cut_without_brackets = cut[1..-2]
+            result << [cut_without_brackets]
+          else
+            fail "This should never happen"
+          end
         end
       end
-    end.uniq.compact
+    end.compact
     # This recursively converts any brackets in the text back into the array_struct function
     # where the upper limit of recursion is 3 levels of bracketing. This is limited by the regex
-    # on line 9 but can easily be extended.
+    # on line 9 but can be extended.
     # If the element of the array is a string then no recursion to apply.
     # If the element of the array is an array then iterate THAT through the array_struct function
     text_array.reject(&:empty?).each_with_object([]) do |str, result|
@@ -62,7 +102,9 @@ module Boogex
   # This function converts the Lucene Boolean `OR` into regex `|` and removes any quotation marks
   def self.ors_to_pipes(obj)
-    return obj.gsub(' OR ', '|').gsub('"', '').gsub("'", '') if obj.is_a?(String)
+    return obj.gsub(OR_REGEX, '|').gsub('"', '').gsub(/\-(?=([^\[]*\[[^\]]*\])*[^\[\]]*$)/, '\-').gsub("'", '') if obj.is_a?(String)
+    raise "There are unclosed brackets in this boolean string" if has_unclosed_brackets?(obj)
     # This recursively applies this function to ensure all levels of the array are converted
     obj.collect do |text|
@@ -70,6 +112,12 @@ module Boogex
     end
   end
+  def self.has_unclosed_brackets?(obj)
+    obj.any? do |o|
+      o.count('(') != o.count(')')
+    end
+  end
   # This function begins to transform the elements of the array structure to regex formatting
   # including:
   # - (a) Any elements that are not bookended by | are then wrapped in (?:) as this modularises
@@ -85,7 +133,7 @@ module Boogex
     if obj.is_a?(String)
       if contain_AND?(obj)
         result = ['AND']
-        result = result + obj.split(' AND ').reject(&:empty?).collect do |str|
+        result = result + obj.split(AND_REGEX).reject(&:empty?).collect do |str|
           regex_formatting(str)
         end
         return result
@@ -101,7 +149,7 @@ module Boogex
         contain_AND?(str)
       end
         result = obj.each_with_object(['AND']) do |str, arr|
-          str.split(' AND ').reject(&:empty?).collect do |str|
+          str.split(AND_REGEX).reject(&:empty?).collect do |str|
             arr << regex_formatting(str)
           end
         end
@@ -125,7 +173,7 @@ module Boogex
     obj.each_with_object(result) do |text, result|
       if contain_AND?(text)
-        text.split(' AND ').reject(&:empty?).each do |str|
+        text.split(AND_REGEX).reject(&:empty?).each do |str|
           result << regex_formatting(str)
         end
       else
@@ -168,7 +216,7 @@ module Boogex
   end
   def self.contain_AND?(obj)
-    obj.is_a?(String) && obj.include?(' AND ')
+    obj.is_a?(String) && obj.match(AND_REGEX)
   end
   def self.all_strings?(array)
@@ -189,4 +237,69 @@ module Boogex
   def self.wrap_in_brackets(text)
     '(?:' + text + ')'
   end
+  def self.get_bracket_regex
+    @bracket_regex ||= generate_brack_regex
+  end
+  # This function generates the bracket regex. For simplicity, the regex for 'inside of a bracket' is represented by the
+  # string 'a', and the regex outside of a bracket is represented by the string 'b'. These are then substituted out at the end.
+  def self.generate_brack_regex
+    puts "Loading bracket regex..."
+    inside_brackets = "[^\(\)]*"
+    not_open_bracket = "[^\(]*"
+    get_bracket_inputs.collect do |input|
+      bracket_input_to_brackets(input).gsub('a', inside_brackets).gsub('b', not_open_bracket)
+    end.join('|')
+  end
+  # This function loads the valid permutations of the bracket regex where 0 represents an open bracket and 1 means closed bracket.
+  # All possible permutations of bracket ordering are generated and then only valid bracket orderings are selected.
+  def self.get_bracket_inputs
+    inputs = []
+    (0..2000).to_a.each do |v|
+      result = v.to_s(2).split('').collect(&:to_i)
+      inputs << result
+      inputs << result.reverse unless result == result.reverse
+      inputs.uniq!
+    end
+    inputs.select { |v| valid?(v) }
+  end
+  def self.valid?(input)
+    # The total count of brackets must be even.
+    return false unless input.size.even?
+    # Only have 0's or 1's as inputs
+    return false if input.any? { |v| ![0, 1].include?(v) }
+    # The number of open brackets must equal the number of closed brackets
+    return false unless input.inject(0) { |n, v| n + v } == input.size / 2
+    # Can't start with a close bracket or end with an open bracket
+    return false if input.first == 1 || input.last == 0
+    sum = 0
+    valid = true
+    input.each_with_index do |v, i|
+      sum += v
+      comparison = (i + 1) / 2
+      valid = false if sum > comparison
+    end
+    valid
+  end
+  # This function first converts the sequence of 0's and 1's to open and close brackets.
+  # It then puts in a 'b' string between any close brackets that are followed by an open bracket.
+  # Finally it compresses any consecutive 'a's into a single 'a' as they are idempotent (i.e. aaa == a).
+  def self.bracket_input_to_brackets(input)
+    brackets = ["\\(a", "a\\)"]
+    input.collect do |i|
+      brackets[i]
+    end.join('').gsub('\\)\\(', '\\)b\\(').gsub(/a+/, 'a')
+  end
 end

data/lib/boogex/version.rb CHANGED

@@ -1,4 +1,4 @@
 # encoding: utf-8
 module Boogex
-  VERSION = '0.1.1'
+  VERSION = '0.1.2'
 end

data/test/convertor_test.rb CHANGED

@@ -4,42 +4,65 @@ describe Boogex do
   it 'turns OR into |' do
     string = 'This OR That'
     expecting = '(?:This|That)'
-    result = Boogex.convert(string)
+    result = Boogex.convert(string)[:inclusive_regex]
     assert_equal expecting, result
   end
   it 'turns boolean AND into AND array string' do
     string = 'This AND That'
     expecting = "AND(['(?:This)','(?:That)'])"
-    result = Boogex.convert(string)
+    result = Boogex.convert(string)[:inclusive_regex]
     assert_equal expecting, result
   end
   it 'understands bracketing' do
     string = '(This OR That) AND My self'
     expecting = "AND(['(?:This|That)','(?:My self)'])"
-    result = Boogex.convert(string)
+    result = Boogex.convert(string)[:inclusive_regex]
     assert_equal expecting, result
   end
   it 'correctly convert this Lucene boolean query string' do
     string = '(((asd OR dd) AND that) AND this) OR What?'
     expecting = "AND([AND(['(?:asd|dd)','(?:that)']),'(?:this)'])|What?"
-    result = Boogex.convert(string)
+    result = Boogex.convert(string)[:inclusive_regex]
     assert_equal expecting, result
   end
   it 'understands embedded AND' do
     string = '((im AND researching) AND travelling)'
     expecting = "AND([AND(['(?:im)','(?:researching)']),'(?:travelling)'])"
-    result = Boogex.convert(string)
+    result = Boogex.convert(string)[:inclusive_regex]
     assert_equal expecting, result
   end
   it 'understands double embedded AND' do
     string = 'Peeps OR ((dude OR roos) AND (what OR Footy))'
     expecting = "Peeps|AND(['(?:dude|roos)','(?:what|Footy)'])"
-    result = Boogex.convert(string)
+    result = Boogex.convert(string)[:inclusive_regex]
+    assert_equal expecting, result
+  end
+  it 'escapes hyphens' do
+    string = '("John Gordon-Smith" OR "hair") AND ("none" OR "all")'
+    expecting = "AND(['(?:John Gordon\\-Smith|hair)','(?:none|all)'])"
+    result = Boogex.convert(string)[:inclusive_regex]
+    assert_equal expecting, result
+  end
+  it 'handles unclosed brackets' do
+    string = '("@AUSOlympicTeam" OR "Olympics") AND ("boxing" OR "@Shelley__watts"))'
+    assert_raises do
+      result = Boogex.convert(string)[:inclusive_regex]
+    end
+  end
+  it 'handles multiple brackets' do
+    string = '((@officialcsa OR "south africa" OR "south african") AND ("cricket" OR "proteas") AND ("national" OR "international") AND ("team"))'
+    expecting = "AND(['(?:@officialcsa|south africa|south african)','(?:cricket|proteas)','(?:national|international)','(?:team)'])"
+    result = Boogex.convert(string)[:inclusive_regex]
     assert_equal expecting, result
   end
 end

data/test/reports/TEST-Boogex.xml CHANGED

@@ -1,11 +1,5 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<testsuite name="Boogex" skipped="0" failures="0" errors="0" tests="4" assertions="4" time="0.0007573158945888281">
-  <testcase name="test_0001_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.0001257510157302022">
-  </testcase>
-  <testcase name="test_0002_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.00011062994599342346">
-  </testcase>
-  <testcase name="test_0003_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.00020186102483421564">
-  </testcase>
-  <testcase name="test_0004_correctly convert this Lucene boolean query string" classname="Boogex" assertions="1" time="0.0003190739080309868">
+<testsuite name="Boogex" skipped="0" failures="0" errors="0" tests="1" assertions="1" time="0.0007526259869337082">
+  <testcase name="test_0001_handles unclosed brackets" classname="Boogex" assertions="1" time="0.0007526259869337082">
   </testcase>
 </testsuite>

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: boogex
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Sam Crouch