human-ql 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 25023c260b71ece677718cfc8aad134860ced434
4
+ data.tar.gz: f3d8d6e3fad057bd458292b384a3b2f0ed0c5595
5
+ SHA512:
6
+ metadata.gz: 83492efda22be913723385c49689f65f86f1520b4f0a3c460ff9a57e565715ef71b727ceddff7756f451fdbf9c9bf0d143ea148798ec3bc26ddb2f9472140513
7
+ data.tar.gz: dcf2c5edb12cb6f2cc38aff7698f89472df203585f93898023f1f7f1c4f15af67a64e66ee75eb596202c6cbb3625c88039e1b5fbeda7a72ca1e41af6c914fd12
@@ -0,0 +1,2 @@
1
+ === 1.0.0 (2016-11-8)
2
+ * Initial release.
@@ -0,0 +1,17 @@
1
+ History.rdoc
2
+ Manifest.txt
3
+ README.rdoc
4
+ Rakefile
5
+ lib/human-ql/base.rb
6
+ lib/human-ql.rb
7
+ lib/human-ql/postgresql_custom_parser.rb
8
+ lib/human-ql/postgresql_generator.rb
9
+ lib/human-ql/query_generator.rb
10
+ lib/human-ql/query_parser.rb
11
+ lib/human-ql/tree_normalizer.rb
12
+ test/setup.rb
13
+ test/test_postgresql_fuzz.rb
14
+ test/test_postgresql_generator.rb
15
+ test/test_query_generator.rb
16
+ test/test_query_parser.rb
17
+ test/test_tree_normalizer.rb
@@ -0,0 +1,59 @@
1
+ = HumanQL
2
+
3
+ * http://github.com/dekellum/human-ql
4
+ * http://rdoc.gravitext.com/human-ql/
5
+
6
+ == Description
7
+
8
+ Human Query Language for full text search engines. Provides a lenient
9
+ parser and associated tools for a self-contained and search-engine
10
+ agnostic query language suitable for use by end users. Lenient in that
11
+ it will produce a parse tree for any input, given a default operator
12
+ and by generally ignoring any unparsable syntax. Suitable for use by
13
+ end users in that it supports potentially several operator variants
14
+ and a query language not unlike some major web search and other
15
+ commercial search engines.
16
+
17
+ The query language supports the following features at a high level:
18
+
19
+ * Boolean operators: AND (infix), OR (infix), NOT (prefix) with an
20
+ implied default operator and precedence rules,
21
+ e.g. "boy or girl -infant"
22
+
23
+ * Optional parentheses for explicitly denoting precedence.
24
+
25
+ * Quoted phrases (for proximity matching)
26
+
27
+ * Declarable prefix scopes, e.g. "TITLE:(car or bike)"
28
+
29
+ The main components are each highly customizable:
30
+
31
+ HumanQL::QueryParser — Parses any arbitrary input string and outputs an
32
+ Abstract Syntax Tree (AST)
33
+
34
+ HumanQL::TreeNormalizer — Normalizes and imposes limits on an AST,
35
+ e.g. avoids pathological queries.
36
+
37
+ HumanQL::QueryGenerator — Given an AST, generates a Human Query
38
+ Language string.
39
+
40
+ HumanQL::PostgreSQLGenerator — Given an AST, generates strings suitable
41
+ for passing to PostgreSQL's to_tsquery function.
42
+
43
+ Other generators are possible.
44
+
45
+ == License
46
+
47
+ Copyright (c) 2016 David Kellum
48
+
49
+ Licensed under the Apache License, Version 2.0 (the "License"); you
50
+ may not use this file except in compliance with the License. You
51
+ may obtain a copy of the License at:
52
+
53
+ http://www.apache.org/licenses/LICENSE-2.0
54
+
55
+ Unless required by applicable law or agreed to in writing, software
56
+ distributed under the License is distributed on an "AS IS" BASIS,
57
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
58
+ implied. See the License for the specific language governing
59
+ permissions and limitations under the License.
@@ -0,0 +1,14 @@
1
+ # -*- ruby -*-
2
+
3
+ require 'rubygems'
4
+ require 'bundler/setup'
5
+ require 'rjack-tarpit'
6
+
7
+ RJack::TarPit.new( 'human-ql' ).define_tasks
8
+
9
+ desc "Upload RDOC to Amazon S3 (rdoc.gravitext.com/human-ql, Oregon)"
10
+ task :publish_rdoc => [ :clean, :rerdoc ] do
11
+ sh <<-SH
12
+ aws s3 sync --acl public-read doc/ s3://rdoc.gravitext.com/human-ql/
13
+ SH
14
+ end
@@ -0,0 +1,17 @@
1
+ #--
2
+ # Copyright (c) 2016 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'human-ql/base'
@@ -0,0 +1,21 @@
1
+ #--
2
+ # Copyright (c) 2016 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ module HumanQL
18
+
19
+ VERSION='1.0.0'
20
+
21
+ end
@@ -0,0 +1,67 @@
1
+ #--
2
+ # Copyright (c) 2016 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'human-ql/query_parser'
18
+
19
+ module HumanQL
20
+
21
+ # Extends the generic QueryParser with additional special character
22
+ # filtering so as to avoid syntax errors in PostgreSQL to_tsquery()
23
+ # for any known input. Note that this is still a parser for the
24
+ # HumanQL query language, not anything implemented in PostgreSQL.
25
+ class PostgreSQLCustomParser < QueryParser
26
+
27
+ # Construct given options to set via base class or as specified
28
+ # below.
29
+ #
30
+ # === Options
31
+ #
32
+ # :pg_version:: A version string ("9.5.5", "9.6.1") or integer
33
+ # array ( [9,6,1]) indicating the target PostgreSQL
34
+ # version. Phrase support starts in 9.6 so quoted
35
+ # phrases are ignored before that.
36
+ #
37
+ def initialize(opts = {})
38
+ opts = opts.dup
39
+ pg_version = opts.delete(:pg_version)
40
+ if pg_version.is_a?( String )
41
+ pg_version = pg_version.split( '.' ).map( &:to_i )
42
+ end
43
+ pg_version ||= []
44
+
45
+ super
46
+
47
+ # Phrase support starts in 9.6
48
+ if ( pg_version <=> [9,6] ) >= 0
49
+ @term_rejects = /[()|&:*!'<>]/
50
+ else
51
+ # Disable quote tokens and reject DQUOTE as token character
52
+ self.lquote = nil
53
+ self.rquote = nil
54
+ @term_rejects = /[()|&:*!'"<>]/
55
+ end
56
+
57
+ end
58
+
59
+ # Replace term_rejects characters with '_' which is punctuation
60
+ # (or effectively, whitespace) in tsquery with tested
61
+ # dictionaries.
62
+ def norm_term( t )
63
+ t.gsub( @term_rejects, '_' )
64
+ end
65
+ end
66
+
67
+ end
@@ -0,0 +1,83 @@
1
+ #--
2
+ # Copyright (c) 2016 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ module HumanQL
18
+
19
+ # Generate query strings suitable for passing to PostgreSQL's
20
+ # to_tsquery function, from a HumanQL abstract syntax tree (AST).
21
+ #
22
+ # In order to guarantee valid output for any human input, the AST
23
+ # should be created using PostgreSQLCustomParser and normalized via
24
+ # TreeNormalizer (with minimal defaults).
25
+ #
26
+ # Any scopes provided in the parser should have been handled and
27
+ # stripped out of the AST, as PostgreSQL is not expected to have a
28
+ # direct equivalent in tsquery syntax.
29
+ class PostgreSQLGenerator
30
+
31
+ #--
32
+ # From https://www.postgresql.org/docs/9.6/static/datatype-textsearch.html
33
+ # > In the absence of parentheses, '!' (NOT) binds most tightly,
34
+ # > and '&' (AND) and '<->' (FOLLOWED BY) both bind more tightly
35
+ # > than | (OR).
36
+ #++
37
+
38
+ AND = ' & '.freeze
39
+ OR = ' | '.freeze
40
+ NOT = '!'.freeze
41
+ NEAR = ' <-> '.freeze
42
+
43
+ # Given the root node of the AST, return a string in PostgreSQL
44
+ # tsquery syntax.
45
+ def generate( node )
46
+ op,*args = node
47
+ if ! node.is_a?( Array )
48
+ op
49
+ elsif args.empty?
50
+ nil
51
+ else
52
+ case op
53
+ when :and
54
+ terms_join( args, AND )
55
+ when :or
56
+ pwrap( terms_join( args, OR ) )
57
+ when :not
58
+ if args[0].is_a?( Array )
59
+ NOT + pwrap( generate( args[0] ) )
60
+ else
61
+ NOT + args[0]
62
+ end
63
+ when :phrase
64
+ terms_join( args, NEAR )
65
+ else
66
+ raise "Unsupported op: #{node.inspect}"
67
+ end
68
+ end
69
+ end
70
+
71
+ protected
72
+
73
+ def terms_join( args, op )
74
+ args.map { |a| generate( a ) }.join( op )
75
+ end
76
+
77
+ def pwrap( inner )
78
+ '(' + inner + ')'
79
+ end
80
+
81
+ end
82
+
83
+ end
@@ -0,0 +1,163 @@
1
+ #--
2
+ # Copyright (c) 2016 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ module HumanQL
18
+
19
+ # Generate a Human Query Language String from an abstract syntax
20
+ # tree (AST). This allows query simplification (e.g. via
21
+ # TreeNormalizer) and re-writing queries.
22
+ class QueryGenerator
23
+
24
+ # The AND operator (if not default).
25
+ # Default: ' and '
26
+ attr_accessor :and
27
+
28
+ # The OR operator (if not default).
29
+ # Default: ' or '
30
+ attr_accessor :or
31
+
32
+ # The NOT operator.
33
+ # Default: '-'
34
+ attr_accessor :not
35
+
36
+ # SPACE delimiter.
37
+ # Default: ' '
38
+ attr_accessor :space
39
+
40
+ # Left quote character for phrases.
41
+ # Default: '"'
42
+ attr_accessor :lquote
43
+
44
+ # Right quote character for phrases.
45
+ # Default: '"'
46
+ attr_accessor :rquote
47
+
48
+ # COLON character used as a prefix delimiter.
49
+ # Default: ':'
50
+ attr_accessor :colon
51
+
52
+ # Left parenthesis character.
53
+ # Default: '('
54
+ attr_accessor :lparen
55
+
56
+ # Right parenthesis character.
57
+ # Default: ')'
58
+ attr_accessor :rparen
59
+
60
+ # The default operator (:and or :or). If set, will output a :space
61
+ # instead of the operator.
62
+ # Default: nil
63
+ attr_accessor :default_op
64
+
65
+ # Hash of operators to precedence integer values, as per
66
+ # QueryParser#precedence. If set, outputs parentheses only when
67
+ # precedence dictates that it is necessary.
68
+ # Default: nil
69
+ attr_accessor :precedence
70
+
71
+ # Set #default_op and #precedence from the given QueryParser, as a
72
+ # convenience.
73
+ def parser=( p )
74
+ @default_op = p.default_op
75
+ @precedence = p.precedence
76
+ end
77
+
78
+ # Construct given options which are interpreted as attribute names
79
+ # to set.
80
+ def initialize( opts = {} )
81
+ @and = ' and '.freeze
82
+ @or = ' or '.freeze
83
+ @not = '-'.freeze
84
+ @space = ' '.freeze
85
+ @lquote = @rquote = '"'.freeze
86
+ @colon = ':'.freeze
87
+ @lparen = '('.freeze
88
+ @rparen = ')'.freeze
89
+ @default_op = nil
90
+ @precedence = nil
91
+
92
+ opts.each do |name,val|
93
+ send( name.to_s + '=', val )
94
+ end
95
+ end
96
+
97
+ # Given the root node of the AST, return a String in Human Query
98
+ # Language syntax.
99
+ def generate( node )
100
+ op,*args = node
101
+ if ! node.is_a?( Array )
102
+ op
103
+ elsif args.empty?
104
+ nil
105
+ else
106
+ case op
107
+ when :and
108
+ terms_join( args, :and )
109
+ when :or
110
+ terms_join( args, :or )
111
+ when :not
112
+ @not + pwrap_gen( args[0], op )
113
+ when :phrase
114
+ @lquote + args.join( @space ) + @rquote
115
+ when String
116
+ op + @colon + pwrap_gen( args[0], op )
117
+ else
118
+ raise "Unsupported op: #{node.inspect}"
119
+ end
120
+ end
121
+ end
122
+
123
+ protected
124
+
125
+ def terms_join( args, op )
126
+ args = args.map { |a| pwrap_gen( a, op ) }
127
+ if op == @default_op
128
+ args.join( @space )
129
+ elsif op == :and
130
+ args.join( @and )
131
+ elsif op == :or
132
+ args.join( @or )
133
+ end
134
+ end
135
+
136
+ def pwrap_gen( node, parent_op )
137
+ if node.is_a?( Array )
138
+ op = node[0]
139
+ if precedence_lte?( parent_op, op )
140
+ generate( node )
141
+ else
142
+ pwrap( generate( node ) )
143
+ end
144
+ else
145
+ node
146
+ end
147
+ end
148
+
149
+ def pwrap( inner )
150
+ @lparen + inner + @rparen
151
+ end
152
+
153
+ def precedence_lte?( op1, op2 )
154
+ if @precedence
155
+ @precedence[op1] <= @precedence[op2]
156
+ else
157
+ false
158
+ end
159
+ end
160
+
161
+ end
162
+
163
+ end