solr_search_sanitizer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +8 -0
- data/lib/solr_search_sanitizer/sanitizer.rb +121 -0
- data/solr_search_sanitizer.gemspec +19 -0
- metadata +58 -0
data/README.md
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
SolrSearchSanitizer
|
2
|
+
===================
|
3
|
+
|
4
|
+
A gem for sanitizing Solr search queries that contain advanced query-syntax characters or nested parentheses. It assumes that e-commerce stores do not advertise Solr's advanced query syntax to their users, so user-entered queries must be sanitized to prevent syntax errors.
|
5
|
+
|
6
|
+
Copyright (c) 2011 minustehbare, released under the New BSD License
|
7
|
+
|
8
|
+
|
@@ -0,0 +1,121 @@
|
|
1
|
+
module SolrSearchSanitizer
  # Mixin with helpers that either escape or strip the characters and keywords
  # that have special meaning in the Apache Lucene Query Parser Syntax.
  #
  # Every helper takes the query string and returns a new string (the argument
  # is never mutated), or nil when the query is nil/false.
  module Sanitizer
    ####################
    #
    # All REGEXP definitions match characters or keywords that are part of the Apache Lucene Query Parser Syntax
    #
    # BACKSLASH_REGEXP => matches the \ character, which is the escape character of the query syntax itself
    #
    # BOOLEAN_OPERATORS_REGEXP => matches boolean operators on multiple search terms
    #
    # BRACKET_REGEXP => matches parentheses and brackets and braces that group search clauses or define range searches
    #
    # WILDCARD_REGEXP => matches ? or * characters in the search that act as single or multiple character wildcards within a search term
    #
    # FUZZY_REGEXP => matches the ~ character at the end of a search term or phrase
    #
    # BOOST_REGEXP => matches the ^ character at the end of a search term or phrase
    #
    # BOOLEAN_MODIFIER_REGEXP => matches + or - characters at the start of search terms/phrases
    #
    # MISC_REGEXP => matches (currently) : and " characters that are used for defining phrases or field values
    #
    ####################
    BACKSLASH_REGEXP = /(\\)/

    # The keyword operators only match as whole words and && / || only match
    # when they are not glued to a word character. ! is matched everywhere:
    # it is an operator even when attached to a term ("!foo", "Yahoo!").
    BOOLEAN_OPERATORS_REGEXP = /\b(AND)\b|\b(OR)\b|\b(NOT)\b|\B(&&)\B|(!)|\B(\|\|)\B/
    BRACKET_REGEXP = /((\()|(\))|(\{)|(\})|(\[)|(\]))/
    WILDCARD_REGEXP = /((\*)|(\?))/
    # Disabled position-aware variant: FUZZY_REGEXP = /[\w|\"](~)[\d]?/
    FUZZY_REGEXP = /(~)/
    # Disabled position-aware variant: BOOST_REGEXP = /[\w|\"](\^)[\d]?/
    BOOST_REGEXP = /(\^)/
    # Disabled position-aware variant: BOOLEAN_MODIFIER_REGEXP = /((\+)|(-))[(\w+(\b))|(\".+\")]/
    BOOLEAN_MODIFIER_REGEXP = /((\+)|(-))/
    MISC_REGEXP = /(\")|(:)/

    # Replacement for each boolean operator: symbolic operators get a
    # backslash in front of every character, keyword operators are lowercased.
    BOOLEAN_OPERATOR_ESCAPES = {
      '&&'  => '\\&\\&',
      '||'  => '\\|\\|',
      '!'   => '\\!',
      'AND' => 'and',
      'NOT' => 'not',
      'OR'  => 'or'
    }.freeze

    # Replacement for each MISC_REGEXP match (see #escape_misc for why the
    # colon is dropped instead of escaped).
    MISC_ESCAPES = { '"' => '\\"', ':' => '' }.freeze

    # Prefixes every backslash in +query+ with another backslash so that it is
    # read as a literal character. Returns nil when +query+ is nil.
    def escape_backslashes(query)
      return nil unless query

      query.gsub(BACKSLASH_REGEXP) { |char| "\\#{char}" }
    end

    # Deletes every backslash from +query+. Returns nil when +query+ is nil.
    def remove_backslashes(query)
      return nil unless query

      query.gsub(BACKSLASH_REGEXP, '')
    end

    # Neutralizes boolean operators: &&, || and ! are backslash-escaped while
    # AND, OR and NOT are lowercased. Returns nil when +query+ is nil.
    def escape_boolean_operators(query)
      return nil unless query

      # Block form instead of gsub(regexp, hash): the hash form needs Ruby 1.9+.
      query.gsub(BOOLEAN_OPERATORS_REGEXP) { |operator| BOOLEAN_OPERATOR_ESCAPES[operator] }
    end

    # Deletes every boolean operator from +query+ (surrounding whitespace is
    # left as it is). Returns nil when +query+ is nil.
    def remove_boolean_operators(query)
      return nil unless query

      query.gsub(BOOLEAN_OPERATORS_REGEXP, '')
    end

    # Backslash-escapes ( ) { } [ ] in +query+. Returns nil when +query+ is nil.
    def escape_brackets(query)
      return nil unless query

      query.gsub(BRACKET_REGEXP) { |char| "\\#{char}" }
    end

    # Deletes ( ) { } [ ] from +query+. Returns nil when +query+ is nil.
    def remove_brackets(query)
      return nil unless query

      query.gsub(BRACKET_REGEXP, '')
    end

    # Backslash-escapes the * and ? wildcards in +query+. Returns nil when
    # +query+ is nil.
    def escape_wildcards(query)
      return nil unless query

      query.gsub(WILDCARD_REGEXP) { |char| "\\#{char}" }
    end

    # Deletes the * and ? wildcards from +query+. Returns nil when +query+ is nil.
    def remove_wildcards(query)
      return nil unless query

      query.gsub(WILDCARD_REGEXP, '')
    end

    # Backslash-escapes every ~ in +query+. Returns nil when +query+ is nil.
    def escape_fuzzy(query)
      return nil unless query

      query.gsub(FUZZY_REGEXP) { |char| "\\#{char}" }
    end

    # Deletes every ~ from +query+. Returns nil when +query+ is nil.
    def remove_fuzzy(query)
      return nil unless query

      query.gsub(FUZZY_REGEXP, '')
    end

    # Backslash-escapes every ^ in +query+. Returns nil when +query+ is nil.
    def escape_boost(query)
      return nil unless query

      query.gsub(BOOST_REGEXP) { |char| "\\#{char}" }
    end

    # Deletes every ^ from +query+. Returns nil when +query+ is nil.
    def remove_boost(query)
      return nil unless query

      query.gsub(BOOST_REGEXP, '')
    end

    # Backslash-escapes every + and - in +query+. Returns nil when +query+ is nil.
    def escape_boolean_modifiers(query)
      return nil unless query

      query.gsub(BOOLEAN_MODIFIER_REGEXP) { |char| "\\#{char}" }
    end

    # Deletes every + and - from +query+. Returns nil when +query+ is nil.
    def remove_boolean_modifiers(query)
      return nil unless query

      query.gsub(BOOLEAN_MODIFIER_REGEXP, '')
    end

    # Backslash-escapes double quotes and drops colons.
    #
    # : (colon) characters in SOLR searches are modified by acts_as_solr_reloaded
    # after they are escaped
    #
    # without sanitization, some_field:some_value => some_field_t:some_value from acts_as_solr
    # with sanitization, some_field:some_value => some_field\_t:some_value
    #
    # for this reason we simply remove the : from the search entirely since it cannot be escaped
    #
    # Returns nil when +query+ is nil.
    def escape_misc(query)
      return nil unless query

      query.gsub(MISC_REGEXP) { |char| MISC_ESCAPES[char] }
    end

    # Deletes every " and : from +query+. Returns nil when +query+ is nil.
    def remove_misc(query)
      return nil unless query

      query.gsub(MISC_REGEXP, '')
    end

    # Runs every escape_* helper over +query+ and returns the fully escaped
    # string, or nil when +query+ is nil.
    def escape_special_characters(query)
      # Backslashes must be handled first: the later steps insert backslashes
      # of their own, and a backslash left over from the input would otherwise
      # pair up with an inserted one ("\)" => "\\)") and re-expose the
      # character that was meant to be escaped.
      escaped = escape_backslashes(query)
      escaped = escape_boolean_operators(escaped)
      escaped = escape_brackets(escaped)
      escaped = escape_wildcards(escaped)
      escaped = escape_fuzzy(escaped)
      escaped = escape_boost(escaped)
      escaped = escape_boolean_modifiers(escaped)
      escape_misc(escaped)
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# Gem specification for solr_search_sanitizer.
Gem::Specification.new do |s|
  s.platform    = Gem::Platform::RUBY
  s.name        = 'solr_search_sanitizer'
  s.version     = '0.1.0'
  s.summary     = 'Provides sanitizer methods for SOLR queries.'
  s.description = 'Provides sanitizer methods for SOLR queries.'
  # The README releases the gem under the New BSD License (SPDX: BSD-3-Clause).
  s.license     = 'BSD-3-Clause'
  s.required_ruby_version = '>= 1.8.7'

  s.author = 'minustehbare'
  s.email  = 'minustehbare@gmail.com'

  # Package whatever git tracks; run this from inside the repository checkout.
  s.files        = `git ls-files`.split("\n")
  s.test_files   = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.require_path = 'lib'
  s.requirements << 'none'
end
|
metadata
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: solr_search_sanitizer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.1.0
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- minustehbare
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-07-14 00:00:00 -04:00
|
14
|
+
default_executable:
|
15
|
+
dependencies: []
|
16
|
+
|
17
|
+
description: Provides sanitizer methods for SOLR queries.
|
18
|
+
email: minustehbare@gmail.com
|
19
|
+
executables: []
|
20
|
+
|
21
|
+
extensions: []
|
22
|
+
|
23
|
+
extra_rdoc_files: []
|
24
|
+
|
25
|
+
files:
|
26
|
+
- README.md
|
27
|
+
- lib/solr_search_sanitizer/sanitizer.rb
|
28
|
+
- solr_search_sanitizer.gemspec
|
29
|
+
has_rdoc: true
|
30
|
+
homepage:
|
31
|
+
licenses: []
|
32
|
+
|
33
|
+
post_install_message:
|
34
|
+
rdoc_options: []
|
35
|
+
|
36
|
+
require_paths:
|
37
|
+
- lib
|
38
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.8.7
|
44
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
45
|
+
none: false
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: "0"
|
50
|
+
requirements:
|
51
|
+
- none
|
52
|
+
rubyforge_project:
|
53
|
+
rubygems_version: 1.6.2
|
54
|
+
signing_key:
|
55
|
+
specification_version: 3
|
56
|
+
summary: Provides sanitizer methods for SOLR queries.
|
57
|
+
test_files: []
|
58
|
+
|