human_query_parser 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rubocop.yml +131 -0
- data/CHANGELOG.md +4 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/Gemfile +5 -0
- data/Guardfile +22 -0
- data/Jenkinsfile +92 -0
- data/LICENSE +21 -0
- data/README.md +261 -0
- data/Rakefile +10 -0
- data/bin/_guard-core +16 -0
- data/bin/guard +16 -0
- data/bin/rake +16 -0
- data/human_query_parser.gemspec +29 -0
- data/lib/human_query_parser.rb +14 -0
- data/lib/human_query_parser/bareword.rb +63 -0
- data/lib/human_query_parser/parser.rb +26 -0
- data/lib/human_query_parser/phrase.rb +36 -0
- data/lib/human_query_parser/query.rb +46 -0
- data/lib/human_query_parser/term.rb +22 -0
- data/lib/human_query_parser/transform.rb +14 -0
- data/lib/human_query_parser/version.rb +3 -0
- data/test/bareword_test.rb +52 -0
- data/test/human_query_parser_test.rb +18 -0
- data/test/parser_test.rb +136 -0
- data/test/phrase_test.rb +33 -0
- data/test/query_test.rb +159 -0
- data/test/term_test.rb +16 -0
- data/test/test_helper.rb +18 -0
- data/test/transform_test.rb +185 -0
- metadata +167 -0
data/Rakefile
ADDED
data/bin/_guard-core
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Bundler-generated binstub for the '_guard-core' executable.
# It points Bundler at this project's Gemfile (unless BUNDLE_GEMFILE is
# already set) and then hands control to the executable shipped in the
# 'guard' gem.
#

require 'pathname'

# Only computed when the caller has not chosen a Gemfile already.
ENV['BUNDLE_GEMFILE'] ||= begin
  stub_path = Pathname.new(__FILE__).realpath
  File.expand_path('../../Gemfile', stub_path)
end

require 'rubygems'
require 'bundler/setup'

load Gem.bin_path('guard', '_guard-core')
|
data/bin/guard
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Bundler-generated binstub for the 'guard' executable.
# It points Bundler at this project's Gemfile (unless BUNDLE_GEMFILE is
# already set) and then hands control to the executable shipped in the
# 'guard' gem.
#

require 'pathname'

# Only computed when the caller has not chosen a Gemfile already.
ENV['BUNDLE_GEMFILE'] ||= begin
  stub_path = Pathname.new(__FILE__).realpath
  File.expand_path('../../Gemfile', stub_path)
end

require 'rubygems'
require 'bundler/setup'

load Gem.bin_path('guard', 'guard')
|
data/bin/rake
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Bundler-generated binstub for the 'rake' executable.
# It points Bundler at this project's Gemfile (unless BUNDLE_GEMFILE is
# already set) and then hands control to the executable shipped in the
# 'rake' gem.
#

require 'pathname'

# Only computed when the caller has not chosen a Gemfile already.
ENV['BUNDLE_GEMFILE'] ||= begin
  stub_path = Pathname.new(__FILE__).realpath
  File.expand_path('../../Gemfile', stub_path)
end

require 'rubygems'
require 'bundler/setup'

load Gem.bin_path('rake', 'rake')
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# Make the gem's own lib/ directory loadable so the version constant can be
# required before the gem is installed.
lib_dir = File.expand_path('lib', File.dirname(__FILE__))
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)

require 'human_query_parser/version'

Gem::Specification.new do |spec|
  # The summary and the description intentionally share the same text.
  blurb = 'A tool for taking search queries of the form most users will expect, and producing ElasticSearch queries that do what most users would expect.'

  # Identity
  spec.name     = 'human_query_parser'
  spec.version  = HumanQueryParser::VERSION
  spec.authors  = ['PatientsLikeMe']
  spec.email    = ['engineers@patientslikeme.com']
  spec.homepage = 'https://www.patientslikeme.com'

  spec.summary     = blurb
  spec.description = blurb

  # Packaged files: everything tracked by git (NUL-separated listing).
  spec.files         = `git ls-files -z`.split("\x0")
  spec.bindir        = 'exe'
  spec.executables   = spec.files.grep(%r{^exe/}) { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^test/})
  spec.require_paths = ['lib']

  # Runtime dependencies
  spec.add_runtime_dependency 'parslet', '~> 1.8'

  # Development dependencies
  spec.add_development_dependency 'bundler', '~> 1.10'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'minitest'
  spec.add_development_dependency 'minitest-reporters'
  spec.add_development_dependency 'pry'
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'human_query_parser/parser'
|
2
|
+
require 'human_query_parser/bareword'
|
3
|
+
require 'human_query_parser/phrase'
|
4
|
+
require 'human_query_parser/query'
|
5
|
+
require 'human_query_parser/term'
|
6
|
+
require 'human_query_parser/transform'
|
7
|
+
|
8
|
+
module HumanQueryParser
  # Turns a human-entered search string into an ElasticSearch query hash.
  #
  # The text is parsed with HumanQueryParser::Parser, the parse tree is
  # converted with HumanQueryParser::Transform, and the resulting object is
  # asked for its ElasticSearch representation.
  #
  # @param query_text [String] the raw search string
  # @param search_fields [Array] fields handed through to `es_query`
  # @return whatever `es_query(search_fields)` returns on the transformed query
  def self.compile(query_text, search_fields)
    tree = HumanQueryParser::Parser.new.parse(query_text)
    HumanQueryParser::Transform.new.apply(tree).es_query(search_fields)
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module HumanQueryParser
  # Unquoted search text and the ElasticSearch `multi_match` clauses built
  # from it.
  class Bareword
    attr_reader :content

    # @param content [#to_s] the search text; stored as a String
    def initialize(content)
      @content = content.to_s
    end

    # Builds the `multi_match` clauses for this text.
    #
    # @param search_fields [Array] value used for every clause's `fields`
    # @param fuzzy [Boolean] when falsy a single plain clause is returned;
    #   when truthy four boosted/fuzzy variants are returned
    # @return [Array<Hash>] hashes of the form `{ multi_match: {...} }`
    def query_fragments(search_fields, fuzzy)
      plain = { fields: search_fields, query: content }
      return [{ multi_match: plain }] unless fuzzy

      # Every fuzzy-mode clause shares the same expansion cap.
      expandable = plain.merge(max_expansions: 50)

      variants = [
        # Fuzzy match on the text.
        expandable.merge(fuzziness: 'AUTO', prefix_length: 1),
        # Same, but with the 'and' operator and a boost.
        expandable.merge(fuzziness: 'AUTO', operator: 'and', boost: 6.0, prefix_length: 1),
        # Phrase-type match with the highest boost.
        expandable.merge(type: 'phrase', boost: 8.0),
        # Fuzzy match with a longer required prefix.
        expandable.merge(fuzziness: 'AUTO', prefix_length: 3),
      ]

      variants.map { |multi_match| { multi_match: multi_match } }
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'parslet'
|
2
|
+
|
3
|
+
module HumanQueryParser
  # Parslet grammar for human-style search queries.
  #
  # A query is a sequence of terms. Each term is an optional `+` or `-`
  # operator followed by either a double-quoted phrase or a bareword, with
  # optional whitespace on either side. The parse result is tagged as
  # `{ query: [{ operator: ..., term: { phrase: ... } | { bareword: ... } }] }`.
  class Parser < Parslet::Parser
    # Literal characters
    rule(:plus)  { str('+') }
    rule(:minus) { str('-') }
    rule(:quote) { str('"') }

    # Whitespace: one or more whitespace characters, or optionally so.
    rule(:space)  { match('\s').repeat(1) }
    rule(:space?) { space.maybe }

    # Term prefix
    rule(:operator) { plus | minus }

    # Everything between a pair of double quotes; the quotes themselves are
    # left out of the tagged value.
    rule(:phrase) do
      phrase_body = (quote.absent? >> any).repeat
      quote >> phrase_body.as(:phrase) >> quote
    end

    # A bareword may start with any character except a quote...
    rule(:bareword_start) { quote.absent? >> any }

    # ...and then runs until the next whitespace.
    rule(:bareword) do
      bareword_rest = (space.absent? >> any).repeat
      (bareword_start >> bareword_rest).as(:bareword)
    end

    # Phrases are tried before barewords.
    rule(:term) do
      tagged_operator = operator.maybe.as(:operator)
      tagged_content = (phrase | bareword).as(:term)
      space? >> tagged_operator >> tagged_content >> space?
    end

    # Entry point
    rule(:query) { term.repeat.as(:query) }
    root :query
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module HumanQueryParser
  # Search text that is matched with a phrase-type `multi_match` clause.
  class Phrase
    attr_reader :content

    # @param content [#to_s] the phrase text; stored as a String
    def initialize(content)
      @content = content.to_s
    end

    # Builds the ElasticSearch clause for this phrase.
    #
    # @param search_fields [Array] value used for the clause's `fields`
    # @param fuzzy [Boolean] when truthy the clause is wrapped in a boosted
    #   `function_score` and gains `max_expansions`; otherwise the plain
    #   phrase clause is returned
    # @return [Array<Hash>] a one-element array holding the clause
    def query_fragments(search_fields, fuzzy)
      phrase_match = { fields: search_fields, query: content, type: 'phrase' }
      return [{ multi_match: phrase_match }] unless fuzzy

      boosted = {
        query: { multi_match: phrase_match.merge(max_expansions: 50) },
        boost: 8.0,
      }
      [{ function_score: boosted }]
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'human_query_parser/bareword'
|
2
|
+
require 'human_query_parser/term'
|
3
|
+
|
4
|
+
module HumanQueryParser
  # A parsed query: terms grouped by operator, renderable as an
  # ElasticSearch `bool` query.
  class Query
    # @return [Hash{String, nil => Array<Term>}] terms keyed by operator
    #   (`nil`, '+' or '-'); barewords within a group are merged into one term
    attr_reader :terms_by_operator

    # @param terms [Array<Term>] the terms making up the query
    def initialize(terms)
      # Group on a normalised operator rather than on the raw object:
      # operators may be string-like tokens that compare equal to '+' / '-'
      # without hashing by value, which would scatter one operator over
      # several groups.
      grouped = terms.group_by { |term| normalize_operator(term.operator) }

      @terms_by_operator = grouped.each_with_object({}) do |(operator, term_group), hash|
        hash[operator] = combine_barewords(term_group, operator)
      end
    end

    # Merges all bareword terms into a single space-joined bareword term,
    # placed ahead of the remaining (non-bareword) terms.
    #
    # @param terms [Array<Term>] terms sharing one operator
    # @param operator [String, nil] operator given to the merged term
    # @return [Array<Term>]
    def combine_barewords(terms, operator)
      bareword_terms, others = terms.partition(&:bareword?)
      return others if bareword_terms.empty?

      strings = bareword_terms.map { |term| term.content.content }
      [Term.new(operator, Bareword.new(strings.join(' ')))] + others
    end

    # @param operator [String, nil] '+', '-' or nil
    # @return [Array<Term>] the terms for that operator, or [] when none
    def terms_for_operator(operator)
      terms_by_operator[normalize_operator(operator)] || []
    end

    # Renders the query as `{ bool: { should: [...], must: [...], must_not: [...] } }`,
    # including only the clauses that have terms.
    #
    # @param search_fields [Array] passed to each term's `query_fragments`
    # @return [Hash]
    def es_query(search_fields)
      bool_clauses = terms_by_operator.each_with_object({}) do |(operator, terms), hash|
        es_operator = case operator
                      when nil then :should
                      when '+' then :must
                      when '-' then :must_not
                      end

        fragments = terms.flat_map { |term| term.query_fragments(search_fields) }
        # Append instead of assigning so that no group can overwrite another.
        (hash[es_operator] ||= []).concat(fragments)
      end

      { bool: bool_clauses }
    end

    private

    # nil stays nil (no operator); anything else becomes its String form.
    def normalize_operator(operator)
      operator.nil? ? nil : operator.to_s
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module HumanQueryParser
  # One element of a query: a piece of content plus the operator (if any)
  # that preceded it.
  class Term
    # @return the operator as given to the constructor (nil when absent)
    attr_reader :operator
    # @return the wrapped content object
    attr_reader :content

    # @param operator the operator for this term, or nil
    # @param content an object responding to `query_fragments(fields, fuzzy)`
    def initialize(operator, content)
      @operator = operator
      @content = content
    end

    # @return [Boolean] whether the content is a Bareword
    def bareword?
      content.kind_of?(Bareword)
    end

    # Terms without an operator are searched fuzzily.
    #
    # @return [Boolean]
    def fuzzy?
      operator.nil?
    end

    # Delegates to the content, passing along this term's fuzziness.
    #
    # @param search_fields [Array] forwarded to the content
    # @return whatever the content's `query_fragments` returns
    def query_fragments(search_fields)
      content.query_fragments(search_fields, fuzzy?)
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'parslet'
|
2
|
+
require 'human_query_parser/bareword'
|
3
|
+
require 'human_query_parser/phrase'
|
4
|
+
require 'human_query_parser/query'
|
5
|
+
require 'human_query_parser/term'
|
6
|
+
|
7
|
+
module HumanQueryParser
  # Parslet transform that converts the parser's tagged tree into
  # Phrase, Bareword, Term and Query objects.
  class Transform < Parslet::Transform
    # { phrase: "..." } becomes a Phrase.
    rule(phrase: simple(:phrase)) do
      Phrase.new(phrase)
    end

    # { bareword: "..." } becomes a Bareword.
    rule(bareword: simple(:bareword)) do
      Bareword.new(bareword)
    end

    # { term: <content>, operator: <operator> } becomes a Term.
    rule(term: simple(:term), operator: simple(:operator)) do
      Term.new(operator, term)
    end

    # { query: [<term>, ...] } becomes a Query.
    rule(query: sequence(:terms)) do
      Query.new(terms)
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
# Specs for the ElasticSearch fragments generated by HumanQueryParser::Bareword.
class HumanQueryParser::BarewordTest < Minitest::Spec
  it 'generates a non-fuzzy query fragment correctly' do
    actual_fragments = HumanQueryParser::Bareword.new('blue').query_fragments(['field1', 'field2'], false)

    expected_fragments = [
      {
        multi_match: {
          fields: ['field1', 'field2'],
          query: 'blue',
        },
      },
    ]

    assert_equal expected_fragments, actual_fragments
  end

  it 'generates a fuzzy query fragment correctly' do
    actual_fragments = HumanQueryParser::Bareword.new('blue').query_fragments(['field1', 'field2'], true)

    basic_multi_match = {
      fields: ['field1', 'field2'],
      query: 'blue',
      max_expansions: 50,
    }

    # The expected fragments must be pairwise distinct: together with the
    # size check below, that is what makes the inclusion checks verify every
    # generated fragment.
    expected_fragments = [
      { multi_match: basic_multi_match.merge(fuzziness: 'AUTO', prefix_length: 1) },
      {
        multi_match: basic_multi_match.merge(
          operator: 'and',
          fuzziness: 'AUTO',
          prefix_length: 1,
          boost: 6.0,
        ),
      },
      { multi_match: basic_multi_match.merge(type: 'phrase', boost: 8.0) },
      # The longer-prefix variant uses prefix_length 3, not 1.
      { multi_match: basic_multi_match.merge(fuzziness: 'AUTO', prefix_length: 3) },
    ]

    assert_equal expected_fragments.size, actual_fragments.size

    expected_fragments.each do |fragment|
      assert actual_fragments.include?(fragment), <<-MESSAGE
        Generated query fragments:
        #{JSON.pretty_generate(actual_fragments)}

        Were expected to contain the following fragment, but didn't:
        #{JSON.pretty_generate(fragment)}
      MESSAGE
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require_relative 'test_helper'
|
2
|
+
|
3
|
+
# End-to-end check of HumanQueryParser.compile.
class HumanQueryParserTest < Minitest::Spec
  it 'compiles things, basically' do
    compiled = HumanQueryParser.compile('+test', ['field1', 'field2'])

    # A '+' term ends up as a single plain multi_match under `must`.
    expected_clause = { multi_match: { fields: ['field1', 'field2'], query: 'test' } }
    expected = { bool: { must: [expected_clause] } }

    assert_equal expected, compiled
  end
end
|
data/test/parser_test.rb
ADDED
@@ -0,0 +1,136 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Specs for the raw parse tree produced by HumanQueryParser::Parser.
class HumanQueryParser::ParserTest < Minitest::Spec
  subject { HumanQueryParser::Parser.new }

  # Expected tree node for a bareword term.
  def bareword_term(text, operator = nil)
    { operator: operator, term: { bareword: text } }
  end

  # Expected tree node for a quoted-phrase term.
  def phrase_term(text, operator = nil)
    { operator: operator, term: { phrase: text } }
  end

  # Asserts that parsing `input` yields exactly the given term nodes.
  def assert_parses(input, *expected_terms)
    assert_equal({ query: expected_terms }, subject.parse(input))
  end

  it 'parses a single term correctly' do
    assert_parses 'word', bareword_term('word')
  end

  it 'parses two terms correctly' do
    assert_parses 'word up', bareword_term('word'), bareword_term('up')
  end

  it 'ignores extra spacing' do
    assert_parses ' word up ', bareword_term('word'), bareword_term('up')
  end

  it 'parses terms with operators correctly' do
    assert_parses '+word', bareword_term('word', '+')
    assert_parses '-word', bareword_term('word', '-')
  end

  it 'parses quoted phrases correctly' do
    assert_parses 'word "a phrase"', bareword_term('word'), phrase_term('a phrase')
  end

  # NOTE(review): the phrase here contains only a single space even though
  # the example is about extra spacing; confirm the intended input.
  it 'passes through extra spacing in phrases' do
    assert_parses '"a phrase"', phrase_term('a phrase')
  end

  it 'parses phrases with operators correctly' do
    assert_parses '+"a phrase"', phrase_term('a phrase', '+')
    assert_parses '-"a phrase"', phrase_term('a phrase', '-')
  end
end
|