human-ql 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 25023c260b71ece677718cfc8aad134860ced434
4
+ data.tar.gz: f3d8d6e3fad057bd458292b384a3b2f0ed0c5595
5
+ SHA512:
6
+ metadata.gz: 83492efda22be913723385c49689f65f86f1520b4f0a3c460ff9a57e565715ef71b727ceddff7756f451fdbf9c9bf0d143ea148798ec3bc26ddb2f9472140513
7
+ data.tar.gz: dcf2c5edb12cb6f2cc38aff7698f89472df203585f93898023f1f7f1c4f15af67a64e66ee75eb596202c6cbb3625c88039e1b5fbeda7a72ca1e41af6c914fd12
@@ -0,0 +1,2 @@
1
+ === 1.0.0 (2016-11-8)
2
+ * Initial release.
@@ -0,0 +1,17 @@
1
+ History.rdoc
2
+ Manifest.txt
3
+ README.rdoc
4
+ Rakefile
5
+ lib/human-ql/base.rb
6
+ lib/human-ql.rb
7
+ lib/human-ql/postgresql_custom_parser.rb
8
+ lib/human-ql/postgresql_generator.rb
9
+ lib/human-ql/query_generator.rb
10
+ lib/human-ql/query_parser.rb
11
+ lib/human-ql/tree_normalizer.rb
12
+ test/setup.rb
13
+ test/test_postgresql_fuzz.rb
14
+ test/test_postgresql_generator.rb
15
+ test/test_query_generator.rb
16
+ test/test_query_parser.rb
17
+ test/test_tree_normalizer.rb
@@ -0,0 +1,59 @@
1
+ = HumanQL
2
+
3
+ * http://github.com/dekellum/human-ql
4
+ * http://rdoc.gravitext.com/human-ql/
5
+
6
+ == Description
7
+
8
+ Human Query Language for full text search engines. Provides a lenient
9
+ parser and associated tools for a self-contained and search-engine
10
+ agnostic query language suitable for use by end users. Lenient in that
11
+ it will produce a parse tree for any input, given a default operator
12
+ and by generally ignoring any unparsable syntax. Suitable for use by
13
+ end users in that it supports potentially several operator variants
14
+ and a query language not unlike some major web search and other
15
+ commercial search engines.
16
+
17
+ The query language supports the following features at a high level:
18
+
19
+ * Boolean operators: AND (infix), OR (infix), NOT (prefix) with an
20
+ implied default operator and precedence rules,
21
+ e.g. "boy or girl -infant"
22
+
23
+ * Optional parentheses for explicitly denoting precedence.
24
+
25
+ * Quoted phrases (for proximity matching)
26
+
27
+ * Declarable prefix scopes, e.g. "TITLE:(car or bike)"
28
+
29
+ The main components are each highly customizable:
30
+
31
+ HumanQL::QueryParser — Parses any arbitrary input string and outputs an
32
+ Abstract Syntax Tree (AST)
33
+
34
+ HumanQL::TreeNormalizer — Normalizes and imposes limits on an AST,
35
+ e.g. avoids pathological queries.
36
+
37
+ HumanQL::QueryGenerator — Given an AST, generates a Human Query
38
+ Language string.
39
+
40
+ HumanQL::PostgreSQLGenerator — Given an AST, generates strings suitable
41
+ for passing to PostgreSQL's to_tsquery function.
42
+
43
+ Other generators are possible.
44
+
45
+ == License
46
+
47
+ Copyright (c) 2016 David Kellum
48
+
49
+ Licensed under the Apache License, Version 2.0 (the "License"); you
50
+ may not use this file except in compliance with the License. You
51
+ may obtain a copy of the License at:
52
+
53
+ http://www.apache.org/licenses/LICENSE-2.0
54
+
55
+ Unless required by applicable law or agreed to in writing, software
56
+ distributed under the License is distributed on an "AS IS" BASIS,
57
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
58
+ implied. See the License for the specific language governing
59
+ permissions and limitations under the License.
@@ -0,0 +1,14 @@
1
# -*- ruby -*-

require 'rubygems'
require 'bundler/setup'
require 'rjack-tarpit'

# Let TarPit define its tasks for the 'human-ql' gem.
RJack::TarPit.new( 'human-ql' ).define_tasks

desc "Upload RDOC to Amazon S3 (rdoc.gravitext.com/human-ql, Oregon)"
task( :publish_rdoc => [ :clean, :rerdoc ] ) do
  # Runs only after the :clean and :rerdoc prerequisites; syncs the
  # doc/ directory to the S3 bucket with a public-read ACL.
  sync_command = <<-SH
    aws s3 sync --acl public-read doc/ s3://rdoc.gravitext.com/human-ql/
  SH
  sh sync_command
end
@@ -0,0 +1,17 @@
1
+ #--
2
+ # Copyright (c) 2016 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'human-ql/base'
@@ -0,0 +1,21 @@
1
+ #--
2
+ # Copyright (c) 2016 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
# Top-level namespace of the human-ql gem.
module HumanQL
  # Version string of this gem release.
  VERSION = '1.0.0'
end
@@ -0,0 +1,67 @@
1
+ #--
2
+ # Copyright (c) 2016 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'human-ql/query_parser'
18
+
19
module HumanQL

  # Extends the generic QueryParser with additional special character
  # filtering so as to avoid syntax errors in PostgreSQL to_tsquery()
  # for any known input. Note that this is still a parser for the
  # HumanQL query language, not anything implemented in PostgreSQL.
  class PostgreSQLCustomParser < QueryParser

    # Construct given options to set via the base class or as
    # specified below.
    #
    # === Options
    #
    # :pg_version:: A version string ("9.5.5", "9.6.1"), a number
    #               (10, 9.6) or an integer array ([9,6,1])
    #               indicating the target PostgreSQL version. Phrase
    #               support starts in 9.6 so quoted phrases are
    #               ignored before that, or when no version is given.
    #
    def initialize(opts = {})
      # Work on a copy so the caller's Hash is not modified when
      # :pg_version is removed.
      opts = opts.dup
      pg_version = normalize_pg_version( opts.delete( :pg_version ) )

      # Only the remaining options are meaningful to the base class.
      super( opts )

      # Phrase support starts in 9.6
      if ( pg_version <=> [9,6] ) >= 0
        @term_rejects = /[()|&:*!'<>]/
      else
        # Disable quote tokens and reject DQUOTE as token character
        self.lquote = nil
        self.rquote = nil
        @term_rejects = /[()|&:*!'"<>]/
      end
    end

    # Replace term_rejects characters with '_' which is punctuation
    # (or effectively, whitespace) in tsquery with tested
    # dictionaries.
    def norm_term( t )
      t.gsub( @term_rejects, '_' )
    end

    private

    # Convert the :pg_version option to an Array of Integers so that
    # it can always be compared to [9,6] via Array#<=>. A nil version
    # becomes [], which sorts before any real version.
    def normalize_pg_version( version )
      case version
      when nil
        []
      when Array
        version.map( &:to_i )
      else
        # Strings and numbers alike: "9.6.1" => [9,6,1], 10 => [10]
        version.to_s.split( '.' ).map( &:to_i )
      end
    end

  end

end
@@ -0,0 +1,83 @@
1
+ #--
2
+ # Copyright (c) 2016 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
module HumanQL

  # Generate query strings suitable for passing to PostgreSQL's
  # to_tsquery function, from a HumanQL abstract syntax tree (AST).
  #
  # In order to guarantee valid output for any human input, the AST
  # should be created using PostgreSQLCustomParser and normalized via
  # TreeNormalizer (with minimal defaults).
  #
  # Any scopes provided in the parser should have been handled and
  # stripped out of the AST, as PostgreSQL is not expected to have a
  # direct equivalent in tsquery syntax.
  class PostgreSQLGenerator

    #--
    # From https://www.postgresql.org/docs/9.6/static/datatype-textsearch.html
    # > In the absence of parentheses, '!' (NOT) binds most tightly,
    # > and '&' (AND) and '<->' (FOLLOWED BY) both bind more tightly
    # > than | (OR).
    #++

    AND  = ' & '.freeze
    OR   = ' | '.freeze
    NOT  = '!'.freeze
    NEAR = ' <-> '.freeze

    # Given the root node of the AST, return a string in PostgreSQL
    # tsquery syntax.
    #
    # A non-Array node (a term) is returned as is. Returns nil for a
    # node without arguments, or one whose arguments all generate
    # nil. Such empty sub-nodes are skipped rather than producing
    # dangling operators. Raises a RuntimeError for an unsupported
    # operator.
    def generate( node )
      return node unless node.is_a?( Array )

      op, *args = node
      return nil if args.empty?

      case op
      when :and
        terms_join( args, AND )
      when :or
        pwrap( terms_join( args, OR ) )
      when :not
        negate( args[0] )
      when :phrase
        terms_join( args, NEAR )
      else
        raise "Unsupported op: #{node.inspect}"
      end
    end

    protected

    # Generate each of args and join the non-nil results with the
    # given operator string. Returns nil if nothing was generated.
    def terms_join( args, op )
      terms = args.map { |a| generate( a ) }.compact
      terms.join( op ) unless terms.empty?
    end

    # Wrap inner in parentheses, passing through nil.
    def pwrap( inner )
      inner && ( '(' + inner + ')' )
    end

    # Generate the negation of a single operand. A nested node is
    # parenthesized; a plain term is prefixed directly. Returns nil
    # when there is nothing to negate.
    def negate( operand )
      if operand.is_a?( Array )
        inner = pwrap( generate( operand ) )
        inner && ( NOT + inner )
      else
        operand && ( NOT + operand )
      end
    end

  end

end
@@ -0,0 +1,163 @@
1
+ #--
2
+ # Copyright (c) 2016 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
module HumanQL

  # Generate a Human Query Language String from an abstract syntax
  # tree (AST). This allows query simplification (e.g. via
  # TreeNormalizer) and re-writing queries.
  class QueryGenerator

    # The AND operator (if not default).
    # Default: ' and '
    attr_accessor :and

    # The OR operator (if not default).
    # Default: ' or '
    attr_accessor :or

    # The NOT operator.
    # Default: '-'
    attr_accessor :not

    # SPACE delimiter.
    # Default: ' '
    attr_accessor :space

    # Left quote character for phrases.
    # Default: '"'
    attr_accessor :lquote

    # Right quote character for phrases.
    # Default: '"'
    attr_accessor :rquote

    # COLON character used as a prefix delimiter.
    # Default: ':'
    attr_accessor :colon

    # Left parenthesis character.
    # Default: '('
    attr_accessor :lparen

    # Right parenthesis character.
    # Default: ')'
    attr_accessor :rparen

    # The default operator (:and or :or). If set, will output a :space
    # instead of the operator.
    # Default: nil
    attr_accessor :default_op

    # Hash of operators to precedence integer values, as per
    # QueryParser#precedence. If set, outputs parentheses only when
    # precedence dictates that it is necessary. An operator without an
    # entry is always parenthesized.
    # Default: nil
    attr_accessor :precedence

    # Set #default_op and #precedence from the given QueryParser, as a
    # convenience.
    def parser=( p )
      @default_op = p.default_op
      @precedence = p.precedence
    end

    # Construct given options which are interpreted as attribute names
    # to set. Only public attribute writers may be named.
    def initialize( opts = {} )
      @and = ' and '.freeze
      @or = ' or '.freeze
      @not = '-'.freeze
      @space = ' '.freeze
      @lquote = @rquote = '"'.freeze
      @colon = ':'.freeze
      @lparen = '('.freeze
      @rparen = ')'.freeze
      @default_op = nil
      @precedence = nil

      # public_send: option names must not reach non-public methods.
      opts.each do |name, val|
        public_send( name.to_s + '=', val )
      end
    end

    # Given the root node of the AST, return a String in Human Query
    # Language syntax.
    #
    # A non-Array node (a term) is returned as is. Returns nil for a
    # node without arguments, or one whose arguments all generate
    # nil. Such empty sub-nodes are skipped rather than producing
    # dangling operators. A String operator is output as a scope
    # prefix. Raises a RuntimeError for an unsupported operator.
    def generate( node )
      return node unless node.is_a?( Array )

      op, *args = node
      return nil if args.empty?

      case op
      when :and, :or
        terms_join( args, op )
      when :not
        operand = pwrap_gen( args[0], op )
        operand && ( @not + operand )
      when :phrase
        @lquote + args.join( @space ) + @rquote
      when String
        scoped = pwrap_gen( args[0], op )
        scoped && ( op + @colon + scoped )
      else
        raise "Unsupported op: #{node.inspect}"
      end
    end

    protected

    # Generate each of args (parenthesized as needed relative to op)
    # and join the non-nil results with the delimiter for op: #space
    # when op is the #default_op, otherwise #and or #or. Returns nil
    # if nothing was generated or op is not :and or :or.
    def terms_join( args, op )
      terms = args.map { |a| pwrap_gen( a, op ) }.compact
      return nil if terms.empty?

      delim =
        if op == @default_op
          @space
        elsif op == :and
          @and
        elsif op == :or
          @or
        end
      terms.join( delim ) if delim
    end

    # Generate node as a child of parent_op, wrapping it in
    # parentheses unless #precedence shows they are unnecessary.
    # Non-Array nodes are returned as is; nil output is not wrapped.
    def pwrap_gen( node, parent_op )
      return node unless node.is_a?( Array )

      inner = generate( node )
      if inner.nil? || precedence_lte?( parent_op, node[0] )
        inner
      else
        pwrap( inner )
      end
    end

    # Wrap inner in #lparen and #rparen.
    def pwrap( inner )
      @lparen + inner + @rparen
    end

    # True only if #precedence is set, has a value for both operators,
    # and the value for op1 is less than or equal to that of op2.
    # Returning false when a value is missing means parentheses are
    # output, which is the safe choice.
    def precedence_lte?( op1, op2 )
      return false unless @precedence

      p1 = @precedence[op1]
      p2 = @precedence[op2]
      !p1.nil? && !p2.nil? && p1 <= p2
    end

  end

end