human-ql 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/History.rdoc +2 -0
- data/Manifest.txt +17 -0
- data/README.rdoc +59 -0
- data/Rakefile +14 -0
- data/lib/human-ql.rb +17 -0
- data/lib/human-ql/base.rb +21 -0
- data/lib/human-ql/postgresql_custom_parser.rb +67 -0
- data/lib/human-ql/postgresql_generator.rb +83 -0
- data/lib/human-ql/query_generator.rb +163 -0
- data/lib/human-ql/query_parser.rb +498 -0
- data/lib/human-ql/tree_normalizer.rb +226 -0
- data/test/setup.rb +24 -0
- data/test/test_postgresql_fuzz.rb +90 -0
- data/test/test_postgresql_generator.rb +171 -0
- data/test/test_query_generator.rb +87 -0
- data/test/test_query_parser.rb +306 -0
- data/test/test_tree_normalizer.rb +153 -0
- metadata +143 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 25023c260b71ece677718cfc8aad134860ced434
|
4
|
+
data.tar.gz: f3d8d6e3fad057bd458292b384a3b2f0ed0c5595
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 83492efda22be913723385c49689f65f86f1520b4f0a3c460ff9a57e565715ef71b727ceddff7756f451fdbf9c9bf0d143ea148798ec3bc26ddb2f9472140513
|
7
|
+
data.tar.gz: dcf2c5edb12cb6f2cc38aff7698f89472df203585f93898023f1f7f1c4f15af67a64e66ee75eb596202c6cbb3625c88039e1b5fbeda7a72ca1e41af6c914fd12
|
data/History.rdoc
ADDED
data/Manifest.txt
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
History.rdoc
|
2
|
+
Manifest.txt
|
3
|
+
README.rdoc
|
4
|
+
Rakefile
|
5
|
+
lib/human-ql/base.rb
|
6
|
+
lib/human-ql.rb
|
7
|
+
lib/human-ql/postgresql_custom_parser.rb
|
8
|
+
lib/human-ql/postgresql_generator.rb
|
9
|
+
lib/human-ql/query_generator.rb
|
10
|
+
lib/human-ql/query_parser.rb
|
11
|
+
lib/human-ql/tree_normalizer.rb
|
12
|
+
test/setup.rb
|
13
|
+
test/test_postgresql_fuzz.rb
|
14
|
+
test/test_postgresql_generator.rb
|
15
|
+
test/test_query_generator.rb
|
16
|
+
test/test_query_parser.rb
|
17
|
+
test/test_tree_normalizer.rb
|
data/README.rdoc
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
= HumanQL
|
2
|
+
|
3
|
+
* http://github.com/dekellum/human-ql
|
4
|
+
* http://rdoc.gravitext.com/human-ql/
|
5
|
+
|
6
|
+
== Description
|
7
|
+
|
8
|
+
Human Query Language for full text search engines. Provides a lenient
|
9
|
+
parser and associated tools for a self-contained and search-engine
|
10
|
+
agnostic query language suitable for use by end users. Lenient in that
|
11
|
+
it will produce a parse tree for any input, given a default operator
|
12
|
+
and by generally ignoring any unparsable syntax. Suitable for use by
|
13
|
+
end users in that it supports potentially several operator variants
|
14
|
+
and a query language not unlike some major web search and other
|
15
|
+
commercial search engines.
|
16
|
+
|
17
|
+
The query language supports the following features at a high level:
|
18
|
+
|
19
|
+
* Boolean operators: AND (infix), OR (infix), NOT (prefix) with an
|
20
|
+
implied default operator and precedence rules,
|
21
|
+
e.g. "boy or girl -infant"
|
22
|
+
|
23
|
+
* Optional parentheses for explicitly denoting precedence.
|
24
|
+
|
25
|
+
* Quoted phrases (for proximity matching)
|
26
|
+
|
27
|
+
* Declarable prefix scopes, e.g. "TITLE:(car or bike)"
|
28
|
+
|
29
|
+
The main components are each highly customizable:
|
30
|
+
|
31
|
+
HumanQL::QueryParser — Parses any arbitrary input string and outputs an
|
32
|
+
Abstract Syntax Tree (AST)
|
33
|
+
|
34
|
+
HumanQL::TreeNormalizer — Normalizes and imposes limits on an AST,
|
35
|
+
e.g. avoids pathological queries.
|
36
|
+
|
37
|
+
HumanQL::QueryGenerator — Given an AST, generates a Human Query
|
38
|
+
Language string.
|
39
|
+
|
40
|
+
HumanQL::PostgreSQLGenerator — Given an AST, generates strings suitable
|
41
|
+
for passing to PostgreSQL's to_tsquery function.
|
42
|
+
|
43
|
+
Other generators are possible.
|
44
|
+
|
45
|
+
== License
|
46
|
+
|
47
|
+
Copyright (c) 2016 David Kellum
|
48
|
+
|
49
|
+
Licensed under the Apache License, Version 2.0 (the "License"); you
|
50
|
+
may not use this file except in compliance with the License. You
|
51
|
+
may obtain a copy of the License at:
|
52
|
+
|
53
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
54
|
+
|
55
|
+
Unless required by applicable law or agreed to in writing, software
|
56
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
57
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
58
|
+
implied. See the License for the specific language governing
|
59
|
+
permissions and limitations under the License.
|
data/Rakefile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'rjack-tarpit'
|
6
|
+
|
7
|
+
# Define the tasks that rjack-tarpit provides for the 'human-ql' gem.
tarpit = RJack::TarPit.new( 'human-ql' )
tarpit.define_tasks

# Sync the locally generated doc/ directory to the public S3 bucket,
# after the :clean and :rerdoc prerequisite tasks have run.
desc "Upload RDOC to Amazon S3 (rdoc.gravitext.com/human-ql, Oregon)"
task( :publish_rdoc => [ :clean, :rerdoc ] ) do
  sh "aws s3 sync --acl public-read doc/ s3://rdoc.gravitext.com/human-ql/\n"
end
|
data/lib/human-ql.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2016 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'human-ql/base'
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2016 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
module HumanQL

  # The human-ql gem version string. Frozen so that the shared
  # constant cannot be mutated in place by a caller.
  VERSION = '1.0.0'.freeze

end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2016 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'human-ql/query_parser'
|
18
|
+
|
19
|
+
module HumanQL
|
20
|
+
|
21
|
+
# Extends the generic QueryParser with additional special character
|
22
|
+
# filtering so as to avoid syntax errors in PostgreSQL to_tsquery()
|
23
|
+
# for any known input. Note that this is still a parser for the
|
24
|
+
# HumanQL query language, not anything implemented in PostgreSQL.
|
25
|
+
class PostgreSQLCustomParser < QueryParser

  # Construct given options, which are passed on to the base class
  # except as specified below.
  #
  # === Options
  #
  # :pg_version:: A version string ("9.5.5", "9.6.1") or integer
  #               array ( [9,6,1]) indicating the target PostgreSQL
  #               version. Phrase support starts in 9.6, so quoted
  #               phrases are ignored before that. When omitted,
  #               the pre-9.6 behavior is used.
  #
  def initialize(opts = {})
    # Work on a copy so the caller's Hash is not modified, and remove
    # :pg_version before the remaining options reach the base class.
    opts = opts.dup
    target = opts.delete(:pg_version) || []
    target = target.split( '.' ).map { |part| part.to_i } if target.is_a?( String )

    super( opts )

    # Phrase support starts in 9.6
    phrases_supported = ( target <=> [9,6] ) >= 0

    unless phrases_supported
      # Disable quote tokens; DQUOTE is rejected as a term character
      # below instead.
      self.lquote = nil
      self.rquote = nil
    end

    @term_rejects = phrases_supported ? /[()|&:*!'<>]/ : /[()|&:*!'"<>]/
  end

  # Replace term_rejects characters with '_' which is punctuation
  # (or effectively, whitespace) in tsquery with tested
  # dictionaries. Returns a new String.
  def norm_term( t )
    t.gsub( @term_rejects ) { '_' }
  end
end
|
66
|
+
|
67
|
+
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2016 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
module HumanQL
|
18
|
+
|
19
|
+
# Generate query strings suitable for passing to PostgreSQL's
|
20
|
+
# to_tsquery function, from a HumanQL abstract syntax tree (AST).
|
21
|
+
#
|
22
|
+
# In order to guarantee valid output for any human input, the AST
|
23
|
+
# should be created using PostgreSQLCustomParser and normalized via
|
24
|
+
# TreeNormalizer (with minimal defaults).
|
25
|
+
#
|
26
|
+
# Any scopes provided in the parser should have been handled and
|
27
|
+
# stripped out of the AST, as PostgreSQL is not expected to have a
|
28
|
+
# direct equivalent in tsquery syntax.
|
29
|
+
class PostgreSQLGenerator

  #--
  # From https://www.postgresql.org/docs/9.6/static/datatype-textsearch.html
  # > In the absence of parentheses, '!' (NOT) binds most tightly,
  # > and '&' (AND) and '<->' (FOLLOWED BY) both bind more tightly
  # > than | (OR).
  #++

  AND  = ' & '.freeze
  OR   = ' | '.freeze
  NOT  = '!'.freeze
  NEAR = ' <-> '.freeze

  # Given the root node of the AST, return a string in PostgreSQL
  # tsquery syntax.
  #
  # A non-Array node (a term) is returned as is. An operator node
  # with no arguments, or whose arguments all generate nothing,
  # returns nil. Such empty sub-expressions are skipped by their
  # parent, so no dangling operators are emitted. Raises a
  # RuntimeError for an unsupported operator.
  def generate( node )
    op, *args = node
    return op unless node.is_a?( Array )
    return nil if args.empty?

    case op
    when :and
      terms_join( args, AND )
    when :or
      # Always parenthesized, since '|' binds least tightly.
      inner = terms_join( args, OR )
      inner && pwrap( inner )
    when :not
      negate( args[0] )
    when :phrase
      terms_join( args, NEAR )
    else
      raise "Unsupported op: #{node.inspect}"
    end
  end

  protected

  # Generate each of args and join the non-nil results with the
  # given operator string. Returns nil if nothing was generated.
  def terms_join( args, op )
    terms = args.map { |a| generate( a ) }.compact
    terms.join( op ) unless terms.empty?
  end

  # Return the negation of the given operand: a sub-expression
  # (Array) is generated and parenthesized, a term is prefixed
  # directly. Returns nil if the operand is nil or generates nothing.
  def negate( operand )
    if operand.is_a?( Array )
      inner = generate( operand )
      inner && ( NOT + pwrap( inner ) )
    elsif operand
      NOT + operand
    end
  end

  # Wrap the inner string in parentheses.
  def pwrap( inner )
    '(' + inner + ')'
  end

end
|
82
|
+
|
83
|
+
end
|
@@ -0,0 +1,163 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2016 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
module HumanQL
|
18
|
+
|
19
|
+
# Generate a Human Query Language String from an abstract syntax
|
20
|
+
# tree (AST). This allows query simplification (e.g. via
|
21
|
+
# TreeNormalizer) and re-writing queries.
|
22
|
+
class QueryGenerator

  # The AND operator (if not default).
  # Default: ' and '
  attr_accessor :and

  # The OR operator (if not default).
  # Default: ' or '
  attr_accessor :or

  # The NOT operator.
  # Default: '-'
  attr_accessor :not

  # SPACE delimiter.
  # Default: ' '
  attr_accessor :space

  # Left quote character for phrases.
  # Default: '"'
  attr_accessor :lquote

  # Right quote character for phrases.
  # Default: '"'
  attr_accessor :rquote

  # COLON character used as a prefix delimiter.
  # Default: ':'
  attr_accessor :colon

  # Left parenthesis character.
  # Default: '('
  attr_accessor :lparen

  # Right parenthesis character.
  # Default: ')'
  attr_accessor :rparen

  # The default operator (:and or :or). If set, will output a :space
  # instead of the operator.
  # Default: nil
  attr_accessor :default_op

  # Hash of operators to precedence integer values, as per
  # QueryParser#precedence. If set, outputs parentheses only when
  # precedence dictates that it is necessary. An operator with no
  # precedence value in the Hash is always parenthesized.
  # Default: nil
  attr_accessor :precedence

  # Set #default_op and #precedence from the given QueryParser, as a
  # convenience.
  def parser=( p )
    @default_op = p.default_op
    @precedence = p.precedence
  end

  # Construct given options which are interpreted as attribute names
  # to set. Only public attribute writers are invoked.
  def initialize( opts = {} )
    @and    = ' and '.freeze
    @or     = ' or '.freeze
    @not    = '-'.freeze
    @space  = ' '.freeze
    @lquote = @rquote = '"'.freeze
    @colon  = ':'.freeze
    @lparen = '('.freeze
    @rparen = ')'.freeze
    @default_op = nil
    @precedence = nil

    opts.each do |name, val|
      public_send( "#{name}=", val )
    end
  end

  # Given the root node of the AST, return a String in Human Query
  # Language syntax.
  #
  # A non-Array node (a term) is returned as is. An operator node
  # with no arguments, or whose arguments all generate nothing,
  # returns nil. Such empty sub-expressions are skipped by their
  # parent. A String operator is output as a prefix scope. Raises a
  # RuntimeError for an unsupported operator.
  def generate( node )
    op, *args = node
    return op unless node.is_a?( Array )
    return nil if args.empty?

    case op
    when :and, :or
      terms_join( args, op )
    when :not
      inner = pwrap_gen( args[0], op )
      inner && ( @not + inner )
    when :phrase
      @lquote + args.join( @space ) + @rquote
    when String
      inner = pwrap_gen( args[0], op )
      inner && ( op + @colon + inner )
    else
      raise "Unsupported op: #{node.inspect}"
    end
  end

  protected

  # Generate each of args, as children of op, and join the non-nil
  # results with the delimiter for op: #space if op is the
  # #default_op, otherwise #and or #or. Returns nil if nothing was
  # generated, or if op is not one of these.
  def terms_join( args, op )
    terms = args.map { |a| pwrap_gen( a, op ) }.compact
    return nil if terms.empty?

    delim =
      if op == @default_op
        @space
      elsif op == :and
        @and
      elsif op == :or
        @or
      end

    terms.join( delim ) if delim
  end

  # Generate the given child node of parent_op, wrapped in
  # parentheses unless precedence makes that unnecessary. A
  # non-Array node is returned as is. Returns nil if the node
  # generates nothing.
  def pwrap_gen( node, parent_op )
    return node unless node.is_a?( Array )

    inner = generate( node )
    if inner.nil? || precedence_lte?( parent_op, node[0] )
      inner
    else
      pwrap( inner )
    end
  end

  # Wrap the inner string in the configured parentheses.
  def pwrap( inner )
    @lparen + inner + @rparen
  end

  # Return true if #precedence is set and the precedence of op1 is
  # less than or equal to that of op2. Returns false (the child gets
  # parenthesized) if either operator has no precedence value,
  # rather than failing on a comparison with nil.
  def precedence_lte?( op1, op2 )
    return false unless @precedence

    p1 = @precedence[op1]
    p2 = @precedence[op2]
    !p1.nil? && !p2.nil? && p1 <= p2
  end

end
|
162
|
+
|
163
|
+
end
|