human-ql 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/History.rdoc +2 -0
- data/Manifest.txt +17 -0
- data/README.rdoc +59 -0
- data/Rakefile +14 -0
- data/lib/human-ql.rb +17 -0
- data/lib/human-ql/base.rb +21 -0
- data/lib/human-ql/postgresql_custom_parser.rb +67 -0
- data/lib/human-ql/postgresql_generator.rb +83 -0
- data/lib/human-ql/query_generator.rb +163 -0
- data/lib/human-ql/query_parser.rb +498 -0
- data/lib/human-ql/tree_normalizer.rb +226 -0
- data/test/setup.rb +24 -0
- data/test/test_postgresql_fuzz.rb +90 -0
- data/test/test_postgresql_generator.rb +171 -0
- data/test/test_query_generator.rb +87 -0
- data/test/test_query_parser.rb +306 -0
- data/test/test_tree_normalizer.rb +153 -0
- metadata +143 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 25023c260b71ece677718cfc8aad134860ced434
|
4
|
+
data.tar.gz: f3d8d6e3fad057bd458292b384a3b2f0ed0c5595
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 83492efda22be913723385c49689f65f86f1520b4f0a3c460ff9a57e565715ef71b727ceddff7756f451fdbf9c9bf0d143ea148798ec3bc26ddb2f9472140513
|
7
|
+
data.tar.gz: dcf2c5edb12cb6f2cc38aff7698f89472df203585f93898023f1f7f1c4f15af67a64e66ee75eb596202c6cbb3625c88039e1b5fbeda7a72ca1e41af6c914fd12
|
data/History.rdoc
ADDED
data/Manifest.txt
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
History.rdoc
|
2
|
+
Manifest.txt
|
3
|
+
README.rdoc
|
4
|
+
Rakefile
|
5
|
+
lib/human-ql/base.rb
|
6
|
+
lib/human-ql.rb
|
7
|
+
lib/human-ql/postgresql_custom_parser.rb
|
8
|
+
lib/human-ql/postgresql_generator.rb
|
9
|
+
lib/human-ql/query_generator.rb
|
10
|
+
lib/human-ql/query_parser.rb
|
11
|
+
lib/human-ql/tree_normalizer.rb
|
12
|
+
test/setup.rb
|
13
|
+
test/test_postgresql_fuzz.rb
|
14
|
+
test/test_postgresql_generator.rb
|
15
|
+
test/test_query_generator.rb
|
16
|
+
test/test_query_parser.rb
|
17
|
+
test/test_tree_normalizer.rb
|
data/README.rdoc
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
= HumanQL
|
2
|
+
|
3
|
+
* http://github.com/dekellum/human-ql
|
4
|
+
* http://rdoc.gravitext.com/human-ql/
|
5
|
+
|
6
|
+
== Description
|
7
|
+
|
8
|
+
Human Query Language for full text search engines. Provides a lenient
|
9
|
+
parser and associated tools for a self-contained and search-engine
|
10
|
+
agnostic query language suitable for use by end users. Lenient in that
|
11
|
+
it will produce a parse tree for any input, given a default operator
|
12
|
+
and by generally ignoring any unparsable syntax. Suitable for use by
|
13
|
+
end users in that it supports potentially several operator variants
|
14
|
+
and a query language not unlike some major web search and other
|
15
|
+
commercial search engines.
|
16
|
+
|
17
|
+
The query language supports the following features at a high level:
|
18
|
+
|
19
|
+
* Boolean operators: AND (infix), OR (infix), NOT (prefix) with an
|
20
|
+
implied default operator and precedence rules,
|
21
|
+
e.g. "boy or girl -infant"
|
22
|
+
|
23
|
+
* Optional parentheses for explicitly denoting precedence.
|
24
|
+
|
25
|
+
* Quoted phrases (for proximity matching)
|
26
|
+
|
27
|
+
* Declarable prefix scopes, e.g. "TITLE:(car or bike)"
|
28
|
+
|
29
|
+
The main components are each highly customizable:
|
30
|
+
|
31
|
+
HumanQL::QueryParser — Parses any arbitrary input string and outputs an
|
32
|
+
Abstract Syntax Tree (AST)
|
33
|
+
|
34
|
+
HumanQL::TreeNormalizer — Normalizes and imposes limits on an AST,
|
35
|
+
e.g. avoids pathological queries.
|
36
|
+
|
37
|
+
HumanQL::QueryGenerator — Given an AST, generates a Human Query
|
38
|
+
Language string.
|
39
|
+
|
40
|
+
HumanQL::PostgreSQLGenerator — Given an AST, generates strings suitable
|
41
|
+
for passing to PostgreSQL's to_tsquery function.
|
42
|
+
|
43
|
+
Other generators are possible.
|
44
|
+
|
45
|
+
== License
|
46
|
+
|
47
|
+
Copyright (c) 2016 David Kellum
|
48
|
+
|
49
|
+
Licensed under the Apache License, Version 2.0 (the "License"); you
|
50
|
+
may not use this file except in compliance with the License. You
|
51
|
+
may obtain a copy of the License at:
|
52
|
+
|
53
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
54
|
+
|
55
|
+
Unless required by applicable law or agreed to in writing, software
|
56
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
57
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
58
|
+
implied. See the License for the specific language governing
|
59
|
+
permissions and limitations under the License.
|
data/Rakefile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'rjack-tarpit'
|
6
|
+
|
7
|
+
RJack::TarPit.new( 'human-ql' ).define_tasks
|
8
|
+
|
9
|
+
desc "Upload RDOC to Amazon S3 (rdoc.gravitext.com/human-ql, Oregon)"
|
10
|
+
# Runs after the :clean and :rerdoc prerequisites, then syncs the doc/
# directory to the S3 bucket with a public-read ACL.
task :publish_rdoc => [ :clean, :rerdoc ] do
  sh "aws s3 sync --acl public-read doc/ s3://rdoc.gravitext.com/human-ql/\n"
end
|
data/lib/human-ql.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2016 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'human-ql/base'
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2016 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
# Top-level namespace of the human-ql gem.
module HumanQL

  # Version of this gem release.
  VERSION = '1.0.0'

end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2016 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'human-ql/query_parser'
|
18
|
+
|
19
|
+
module HumanQL
|
20
|
+
|
21
|
+
# Extends the generic QueryParser with additional special character
|
22
|
+
# filtering so as to avoid syntax errors in PostgreSQL to_tsquery()
|
23
|
+
# for any known input. Note that this is still a parser for the
|
24
|
+
# HumanQL query language, not anything implemented in PostgreSQL.
|
25
|
+
class PostgreSQLCustomParser < QueryParser

  # Construct from the given options. The :pg_version option is
  # consumed here; all remaining options are passed on to the base
  # class constructor.
  #
  # === Options
  #
  # :pg_version:: A version string ("9.5.5", "9.6.1") or integer
  #               array ( [9,6,1]) indicating the target PostgreSQL
  #               version. Phrase support starts in 9.6 so quoted
  #               phrases are ignored before that. When omitted, the
  #               parser behaves as for a version before 9.6.
  #
  def initialize(opts = {})
    opts = opts.dup
    pg_version = opts.delete(:pg_version)
    pg_version = pg_version.split( '.' ).map( &:to_i ) if pg_version.is_a?( String )
    pg_version ||= []

    # Bare super forwards the (already filtered) opts to the base class.
    super

    # Phrase support starts in 9.6
    phrases_supported = ( pg_version <=> [9,6] ) >= 0

    unless phrases_supported
      # Disable quote tokens; DQUOTE is rejected as a term character
      # via the pattern chosen below.
      self.lquote = nil
      self.rquote = nil
    end

    @term_rejects =
      if phrases_supported
        /[()|&:*!'<>]/
      else
        /[()|&:*!'"<>]/
      end
  end

  # Substitute '_' for every character matching the term_rejects
  # pattern. The underscore is punctuation (or effectively,
  # whitespace) in tsquery with tested dictionaries.
  def norm_term( t )
    t.gsub( @term_rejects ) { '_' }
  end
end
|
66
|
+
|
67
|
+
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2016 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
module HumanQL
|
18
|
+
|
19
|
+
# Generate query strings suitable for passing to PostgreSQL's
|
20
|
+
# to_tsquery function, from a HumanQL abstract syntax tree (AST).
|
21
|
+
#
|
22
|
+
# In order to guarantee valid output for any human input, the AST
|
23
|
+
# should be created using PostgreSQLCustomParser and normalized via
|
24
|
+
# TreeNormalizer (with minimal defaults).
|
25
|
+
#
|
26
|
+
# Any scopes provided in the parser should have been handled and
|
27
|
+
# stripped out of the AST, as PostgreSQL is not expected to have a
|
28
|
+
# direct equivalent in tsquery syntax.
|
29
|
+
class PostgreSQLGenerator

  #--
  # From https://www.postgresql.org/docs/9.6/static/datatype-textsearch.html
  # > In the absence of parentheses, '!' (NOT) binds most tightly,
  # > and '&' (AND) and '<->' (FOLLOWED BY) both bind more tightly
  # > than | (OR).
  #++

  AND  = ' & '.freeze
  OR   = ' | '.freeze
  NOT  = '!'.freeze
  NEAR = ' <-> '.freeze

  # Convert the AST rooted at node into a String in PostgreSQL
  # tsquery syntax. A non-Array node (a plain term) is returned as
  # is; an operator node without arguments yields nil. Raises a
  # RuntimeError for any operator other than :and, :or, :not or
  # :phrase.
  def generate( node )
    op, *args = node
    return op unless node.is_a?( Array )
    return nil if args.empty?

    case op
    when :and
      terms_join( args, AND )
    when :or
      # OR binds least tightly, so it is always parenthesized.
      pwrap( terms_join( args, OR ) )
    when :phrase
      terms_join( args, NEAR )
    when :not
      operand = args[0]
      NOT + ( operand.is_a?( Array ) ? pwrap( generate( operand ) ) : operand )
    else
      raise "Unsupported op: #{node.inspect}"
    end
  end

  protected

  # Generate each argument and join the results with the given
  # operator string.
  def terms_join( args, op )
    pieces = args.map { |child| generate( child ) }
    pieces.join( op )
  end

  # Surround the given String with parentheses.
  def pwrap( inner )
    '(' + inner + ')'
  end

end
|
82
|
+
|
83
|
+
end
|
@@ -0,0 +1,163 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2016 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
module HumanQL
|
18
|
+
|
19
|
+
# Generate a Human Query Language String from an abstract syntax
|
20
|
+
# tree (AST). This allows query simplification (e.g. via
|
21
|
+
# TreeNormalizer) and re-writing queries.
|
22
|
+
class QueryGenerator

  # Text emitted for the AND operator (when it is not the default
  # operator).
  # Default: ' and '
  attr_accessor :and

  # Text emitted for the OR operator (when it is not the default
  # operator).
  # Default: ' or '
  attr_accessor :or

  # Text emitted for the NOT prefix operator.
  # Default: '-'
  attr_accessor :not

  # SPACE delimiter.
  # Default: ' '
  attr_accessor :space

  # Opening quote character for phrases.
  # Default: '"'
  attr_accessor :lquote

  # Closing quote character for phrases.
  # Default: '"'
  attr_accessor :rquote

  # COLON character used as a prefix (scope) delimiter.
  # Default: ':'
  attr_accessor :colon

  # Opening parenthesis character.
  # Default: '('
  attr_accessor :lparen

  # Closing parenthesis character.
  # Default: ')'
  attr_accessor :rparen

  # The default operator (:and or :or). If set, the #space delimiter
  # is output in place of that operator.
  # Default: nil
  attr_accessor :default_op

  # Hash of operators to precedence integer values, as per
  # QueryParser#precedence. If set, parentheses are output only
  # where precedence dictates that they are necessary.
  # Default: nil
  attr_accessor :precedence

  # Convenience setter: copy #default_op and #precedence from the
  # given QueryParser.
  def parser=( p )
    @default_op = p.default_op
    @precedence = p.precedence
  end

  # Construct with defaults, then apply the given options, each key
  # of which is interpreted as the name of an attribute to set.
  def initialize( opts = {} )
    @and        = ' and '.freeze
    @or         = ' or '.freeze
    @not        = '-'.freeze
    @space      = ' '.freeze
    @rquote     = @lquote = '"'.freeze
    @colon      = ':'.freeze
    @lparen     = '('.freeze
    @rparen     = ')'.freeze
    @default_op = nil
    @precedence = nil

    opts.each { |name, val| send( "#{name}=", val ) }
  end

  # Convert the AST rooted at node into a String in Human Query
  # Language syntax. A non-Array node (a plain term) is returned as
  # is; an operator node without arguments yields nil. A String
  # operator is output as a scope prefix. Raises a RuntimeError for
  # any other unrecognized operator.
  def generate( node )
    op, *args = node
    return op unless node.is_a?( Array )
    return nil if args.empty?

    case op
    when :and, :or
      terms_join( args, op )
    when :not
      @not + pwrap_gen( args[0], op )
    when :phrase
      @lquote + args.join( @space ) + @rquote
    when String
      op + @colon + pwrap_gen( args[0], op )
    else
      raise "Unsupported op: #{node.inspect}"
    end
  end

  protected

  # Generate each argument (parenthesized as needed) and join them
  # using #space when op is the default operator, or otherwise the
  # text configured for :and or :or.
  def terms_join( args, op )
    parts = args.map { |child| pwrap_gen( child, op ) }
    return parts.join( @space ) if op == @default_op

    case op
    when :and then parts.join( @and )
    when :or  then parts.join( @or )
    end
  end

  # Generate the given child node, wrapping it in parentheses unless
  # precedence allows them to be dropped. Plain terms are returned
  # as is.
  def pwrap_gen( node, parent_op )
    return node unless node.is_a?( Array )

    bare = precedence_lte?( parent_op, node.first )
    text = generate( node )
    bare ? text : pwrap( text )
  end

  # Surround the given String with the configured parentheses.
  def pwrap( inner )
    @lparen + inner + @rparen
  end

  # True if #precedence is set and rates op1 no higher than op2.
  # Always false when no precedence is set.
  def precedence_lte?( op1, op2 )
    return false unless @precedence

    @precedence[op1] <= @precedence[op2]
  end

end
|
162
|
+
|
163
|
+
end
|