answerific 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -7
- data/Rakefile +2 -0
- data/lib/answerific/miner.rb +174 -0
- data/lib/answerific/version.rb +1 -1
- data/lib/answerific.rb +1 -161
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e81fef1ef9b77e8823ea1eea1f157e52c80af930
|
|
4
|
+
data.tar.gz: 9e806579b3bc22837c58daf63a4200eef9a698dd
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a5d1fd241089b7f2bd3feea69a193361684734bdd5012cbc8aee7992ac9d6fa2fb1081a2b7e3bd6d0a3b9d3a5d0bb10990262b2cf44b33dbade31e64c8635b88
|
|
7
|
+
data.tar.gz: a84357bd7aa62b49affcb427c7d5e1256bbb7fe1529b498cccb7a50f9694705aa2e5ac5168b0202f8ebde3d2dc9b22252fe19ad45f52546585ab633fd1a647cc
|
data/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Answerific
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Mining bot that can answer natural language questions by mining the web.
|
|
4
4
|
|
|
5
5
|
## Installation
|
|
6
6
|
|
|
@@ -20,7 +20,7 @@ Or install it yourself as:
|
|
|
20
20
|
|
|
21
21
|
## Usage
|
|
22
22
|
|
|
23
|
-
bot = Answerific::Bot.new()
|
|
23
|
+
bot = Answerific::Miner.new()
|
|
24
24
|
bot.answer('what is the composition of Pluto?')
|
|
25
25
|
|
|
26
26
|
## How it works
|
|
@@ -35,11 +35,9 @@ Given an input, answerific will
|
|
|
35
35
|
|
|
36
36
|
## Roadmap
|
|
37
37
|
|
|
38
|
-
* Add options at initialization
|
|
39
|
-
*
|
|
40
|
-
*
|
|
41
|
-
* Better support for wh-words (atm, the bot just gets rid of them)
|
|
42
|
-
* Better support for yes-no questions: answer with definite yes-no instead of statement
|
|
38
|
+
* [ ] Add options at initialization
|
|
39
|
+
* [ ] Better support for wh-words (atm, the bot just gets rid of them)
|
|
40
|
+
* [ ] Better support for yes-no questions: answer with definite yes-no instead of statement
|
|
43
41
|
|
|
44
42
|
## Development
|
|
45
43
|
|
data/Rakefile
CHANGED
data/lib/answerific/miner.rb
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
require 'cgi'
|
|
2
|
+
|
|
3
|
+
# Miner bot that answers questions by extracting information from the web
|
|
4
|
+
# Currently only supports Google Search
|
|
5
|
+
class Answerific::Miner
|
|
6
|
+
|
|
7
|
+
# Answers `question` by querying Google
|
|
8
|
+
# Assumes `question` is downcase, only contains alpha numeric characters
|
|
9
|
+
# (i.e. has been preprocessed by Answerific::Bot.preprocess)
|
|
10
|
+
# Returns a string containing the response or nil if none is found
|
|
11
|
+
def answer(question)
|
|
12
|
+
p 'Answering ' + question
|
|
13
|
+
return nil if !question || question.empty?
|
|
14
|
+
mine(parse(preprocess(question)))
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
# === SELECT RESPONSE ===
|
|
18
|
+
|
|
19
|
+
def process_google_results(results, query)
|
|
20
|
+
candidates = select_responses(results, query)
|
|
21
|
+
select_best_response(candidates)
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
# Returns a single response from the list of responses
|
|
25
|
+
# TODO how to select the best? right now, return a random one (Array#sample)
|
|
26
|
+
def select_best_response(responses)
|
|
27
|
+
responses.sample
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# Returns the sentences from `results` that contain all the words in `query`
|
|
31
|
+
def select_responses(results, query)
|
|
32
|
+
sentences = results.map { |r| split_at_dot(r) }.flatten
|
|
33
|
+
query_words = query.split ' '
|
|
34
|
+
|
|
35
|
+
# Select the responses, only keeping the sentence that contain the search query
|
|
36
|
+
selected = sentences.select do |sentence|
|
|
37
|
+
query_words.all? { |w| sentence.include? w } # contains all query words
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
return selected
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
# === EXTRACT INFO ===
|
|
44
|
+
|
|
45
|
+
def mine(query)
|
|
46
|
+
results = []
|
|
47
|
+
|
|
48
|
+
Google::Search::Web.new(query: query).each do |r|
|
|
49
|
+
results << clean_google_result(r.content)
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
process_google_results(results, query)
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
# === PARSE AND REARRANGE === (prepare for search engines)
|
|
56
|
+
|
|
57
|
+
def parse(question)
|
|
58
|
+
type = broad_question_type question
|
|
59
|
+
parsed = ''
|
|
60
|
+
|
|
61
|
+
case type
|
|
62
|
+
when 'wh'
|
|
63
|
+
parsed = parse_wh_question question
|
|
64
|
+
when 'yes-no'
|
|
65
|
+
parsed = parse_yes_no_question question
|
|
66
|
+
when 'declarative'
|
|
67
|
+
parsed = parse_declarative_question question
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
return parsed
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
# TODO consider verb permutations
|
|
74
|
+
# TODO consider wh-word: where is the sun => the sun is [located]
|
|
75
|
+
# Parses the wh-question `question` by removing the wh-word and moving the main verb at the end
|
|
76
|
+
# Assumptions:
|
|
77
|
+
# * wh-word is at the beginning
|
|
78
|
+
# * main verb follows the wh-word
|
|
79
|
+
# (TODO not accurate for which/whose but should be ok for the others)
|
|
80
|
+
# Example:
|
|
81
|
+
# question: 'where is the Kuiper belt'
|
|
82
|
+
# returns : 'the Kuiper belt is'
|
|
83
|
+
def parse_wh_question(question)
|
|
84
|
+
words = question.split ' '
|
|
85
|
+
parsed = words[2..-1] << words[1]
|
|
86
|
+
parsed.join " "
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
# Returns an array of permutations of the main verb in the question without the wh-word
|
|
90
|
+
# Parses the wh-question `question` by removing the wh-word
|
|
91
|
+
# Assumptions:
|
|
92
|
+
# * wh-word is at the beginning
|
|
93
|
+
# * main verb follows the wh-word
|
|
94
|
+
# (TODO not accurate for which/whose but should be ok for the others)
|
|
95
|
+
# Example:
|
|
96
|
+
# question: 'where is the Kuiper belt'
|
|
97
|
+
# returns : ['is the Kuiper belt',
|
|
98
|
+
# 'the is Kuiper belt',
|
|
99
|
+
# 'the Kuiper is belt',
|
|
100
|
+
# 'the Kuiper belt is']
|
|
101
|
+
# def parse_wh_question(question)
|
|
102
|
+
|
|
103
|
+
# end
|
|
104
|
+
|
|
105
|
+
# Returns `question` without the yes-no verb
|
|
106
|
+
# Example:
|
|
107
|
+
# question: 'is pluto closer to the sun than saturn'
|
|
108
|
+
# returns : 'pluto closer to the sun than saturn'
|
|
109
|
+
def parse_yes_no_question(question)
|
|
110
|
+
words = question.split ' '
|
|
111
|
+
return words[1..-1].join ' '
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
# Returns `question` without the declarative statement
|
|
115
|
+
# Example:
|
|
116
|
+
# question: 'tell me what is Pluto'
|
|
117
|
+
# returns : 'what is Pluto'
|
|
118
|
+
def parse_declarative_question(question)
|
|
119
|
+
declarative_expressions = [ 'tell me', 'I want to know' ]
|
|
120
|
+
return question.gsub(/^#{Regexp.union(*declarative_expressions)}/, '').strip
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
# === DETECT TYPE OF QUESTION ===
|
|
124
|
+
|
|
125
|
+
def broad_question_type(question)
|
|
126
|
+
return 'wh' if is_wh_question question
|
|
127
|
+
return 'yes-no' if is_yes_no_question question
|
|
128
|
+
return 'declarative'
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
# Returns true if question starts with a wh-question word
|
|
132
|
+
def is_wh_question(question)
|
|
133
|
+
wh_words = %w(who where when why what which how)
|
|
134
|
+
return /^#{Regexp.union(*wh_words)}/ === question
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
# Returns true if question starts with a yes-no question expression
|
|
138
|
+
def is_yes_no_question(question)
|
|
139
|
+
yes_no_words = %w(am are is was were have has do does did can could should may)
|
|
140
|
+
return /^#{Regexp.union(*yes_no_words)}/ === question
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
# === PREPROCESSING ===
|
|
144
|
+
|
|
145
|
+
# Returns cleaned `input`
|
|
146
|
+
def preprocess(input)
|
|
147
|
+
clean(input)
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
# Cleans the string `input` by removing non alpha-numeric characters
|
|
151
|
+
def clean(input)
|
|
152
|
+
ret = input.downcase
|
|
153
|
+
ret.gsub(/[^0-9a-z ]/i, '').strip
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
# === OTHER FORMATTING ===
|
|
157
|
+
|
|
158
|
+
def clean_google_result(string)
|
|
159
|
+
string = CGI.unescapeHTML(string)
|
|
160
|
+
string
|
|
161
|
+
.downcase
|
|
162
|
+
.gsub(/[^\.]+\.{3,}/, '') # remove incomplete sentences
|
|
163
|
+
.gsub(/<("[^"]*"|'[^']*'|[^'">])*>/, '') # html tags
|
|
164
|
+
.gsub(/\w{3} \d{1,2}, \d{4} \.{3} /, '') # dates (27 Jan, 2015)
|
|
165
|
+
.gsub("\n",'') # new lines
|
|
166
|
+
.strip
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
def split_at_dot(string)
|
|
170
|
+
# matches NUM. or ALPHAALPHA.
|
|
171
|
+
re = /([0-9]|[a-z]{2})[\.\?!] ?/i
|
|
172
|
+
string.split(re).each_slice(2).map(&:join)
|
|
173
|
+
end
|
|
174
|
+
end
|
data/lib/answerific/version.rb
CHANGED
data/lib/answerific.rb
CHANGED
|
@@ -1,166 +1,6 @@
|
|
|
1
1
|
require "answerific/version"
|
|
2
|
+
require "answerific/miner.rb"
|
|
2
3
|
require "google-search"
|
|
3
4
|
|
|
4
5
|
module Answerific
|
|
5
|
-
class Bot
|
|
6
|
-
|
|
7
|
-
def answer(question)
|
|
8
|
-
mine(parse(preprocess(question)))
|
|
9
|
-
end
|
|
10
|
-
|
|
11
|
-
# === SELECT RESPONSE ===
|
|
12
|
-
|
|
13
|
-
def process_google_results(results, query)
|
|
14
|
-
candidates = select_responses(results, query)
|
|
15
|
-
select_best_response(candidates)
|
|
16
|
-
end
|
|
17
|
-
|
|
18
|
-
# Returns a single response from the list of responses
|
|
19
|
-
# TODO how to select the best? right now, return the first one
|
|
20
|
-
def select_best_response(responses)
|
|
21
|
-
responses.sample
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
# Returns the responses from `results` that have a the words in `query`
|
|
25
|
-
def select_responses(results, query)
|
|
26
|
-
sentences = results.map { |r| split_at_dot r }.flatten
|
|
27
|
-
query_words = query.split ' '
|
|
28
|
-
|
|
29
|
-
# Select the responses, only keeping the sentence that contain the search query
|
|
30
|
-
selected = sentences.select do |sentence|
|
|
31
|
-
query_words.all? { |w| sentence.include? w } # contains all query words
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
return selected
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
# === EXTRACT INFO ===
|
|
38
|
-
|
|
39
|
-
def mine(query)
|
|
40
|
-
results = []
|
|
41
|
-
|
|
42
|
-
Google::Search::Web.new(query: query).each do |r|
|
|
43
|
-
results << clean_google_result(r.content)
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
process_google_results(results, query)
|
|
47
|
-
end
|
|
48
|
-
|
|
49
|
-
# === PARSE AND REARRANGE === (prepare for search engines)
|
|
50
|
-
|
|
51
|
-
def parse(question)
|
|
52
|
-
type = broad_question_type question
|
|
53
|
-
parsed = ''
|
|
54
|
-
|
|
55
|
-
case type
|
|
56
|
-
when 'wh'
|
|
57
|
-
parsed = parse_wh_question question
|
|
58
|
-
when 'yes-no'
|
|
59
|
-
parsed = parse_yes_no_question question
|
|
60
|
-
when 'declarative'
|
|
61
|
-
parsed = parse_declarative_question question
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
return parsed
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
# TODO consider verb permutations
|
|
68
|
-
# TODO consider wh-word: where is the sun => the sun is [located]
|
|
69
|
-
# Parses the wh-question `question` by removing the wh-word and moving the main verb at the end
|
|
70
|
-
# Assumptions:
|
|
71
|
-
# * wh-word is at the beginning
|
|
72
|
-
# * main verb follows the wh-word
|
|
73
|
-
# (TODO not accurate for which/whose but should be ok for the others)
|
|
74
|
-
# Example:
|
|
75
|
-
# question: 'where is the Kuiper belt'
|
|
76
|
-
# returns : 'the Kuiper belt is'
|
|
77
|
-
def parse_wh_question(question)
|
|
78
|
-
words = question.split ' '
|
|
79
|
-
parsed = words[2..-1] << words[1]
|
|
80
|
-
parsed.join " "
|
|
81
|
-
end
|
|
82
|
-
|
|
83
|
-
# Returns an array of permutations of the main verb in the question without the wh-word
|
|
84
|
-
# Parses the wh-question `question` by removing the wh-word
|
|
85
|
-
# Assumptions:
|
|
86
|
-
# * wh-word is at the beginning
|
|
87
|
-
# * main verb follows the wh-word
|
|
88
|
-
# (TODO not accurate for which/whose but should be ok for the others)
|
|
89
|
-
# Example:
|
|
90
|
-
# question: 'where is the Kuiper belt'
|
|
91
|
-
# returns : ['is the Kuiper belt',
|
|
92
|
-
# 'the is Kuiper belt',
|
|
93
|
-
# 'the Kuiper is belt',
|
|
94
|
-
# 'the Kuiper belt is']
|
|
95
|
-
# def parse_wh_question(question)
|
|
96
|
-
|
|
97
|
-
# end
|
|
98
|
-
|
|
99
|
-
# Returns `question` without the yes-no verb
|
|
100
|
-
# Example:
|
|
101
|
-
# question: 'is pluto closer to the sun than saturn'
|
|
102
|
-
# returns : 'pluto closer to the sun than saturn'
|
|
103
|
-
def parse_yes_no_question(question)
|
|
104
|
-
words = question.split ' '
|
|
105
|
-
return words[1..-1].join ' '
|
|
106
|
-
end
|
|
107
|
-
|
|
108
|
-
# Returns `question` without the declarative statement
|
|
109
|
-
# Example:
|
|
110
|
-
# question: 'tell me what is Pluto'
|
|
111
|
-
# returns : 'what is Pluto'
|
|
112
|
-
def parse_declarative_question(question)
|
|
113
|
-
declarative_expressions = [ 'tell me', 'I want to know' ]
|
|
114
|
-
return question.gsub(/^#{Regexp.union(*declarative_expressions)}/, '').strip
|
|
115
|
-
end
|
|
116
|
-
|
|
117
|
-
# === DETECT TYPE OF QUESTION ===
|
|
118
|
-
|
|
119
|
-
def broad_question_type(question)
|
|
120
|
-
return 'wh' if is_wh_question question
|
|
121
|
-
return 'yes-no' if is_yes_no_question question
|
|
122
|
-
return 'declarative'
|
|
123
|
-
end
|
|
124
|
-
|
|
125
|
-
# Returns true if question starts with a wh-question word
|
|
126
|
-
def is_wh_question(question)
|
|
127
|
-
wh_words = %w(who where when why what which how)
|
|
128
|
-
return /^#{Regexp.union(*wh_words)}/ === question
|
|
129
|
-
end
|
|
130
|
-
|
|
131
|
-
# Returns true if question starts with a yes-no question expression
|
|
132
|
-
def is_yes_no_question(question)
|
|
133
|
-
yes_no_words = %w(am are is was were have has do does did can could should may)
|
|
134
|
-
return /^#{Regexp.union(*yes_no_words)}/ === question
|
|
135
|
-
end
|
|
136
|
-
|
|
137
|
-
# === PREPROCESSING ===
|
|
138
|
-
|
|
139
|
-
# Returns cleaned `input`
|
|
140
|
-
def preprocess(input)
|
|
141
|
-
clean(input)
|
|
142
|
-
end
|
|
143
|
-
|
|
144
|
-
# Cleans the string `input` by removing non alpha-numeric characters
|
|
145
|
-
def clean(input)
|
|
146
|
-
ret = input.downcase
|
|
147
|
-
ret.gsub(/[^0-9a-z ]/i, '').strip
|
|
148
|
-
end
|
|
149
|
-
|
|
150
|
-
# === OTHER FORMATTING ===
|
|
151
|
-
|
|
152
|
-
def clean_google_result(string)
|
|
153
|
-
string
|
|
154
|
-
.downcase
|
|
155
|
-
.gsub(/[^\.]+\.{3,}/, '') # remove incomplete sentences
|
|
156
|
-
.gsub(/<("[^"]*"|'[^']*'|[^'">])*>/, '') # html tags
|
|
157
|
-
.gsub(/\w{3} \d{1,2}, \d{4} \.{3} /, '') # dates (27 Jan, 2015)
|
|
158
|
-
.gsub("\n",'') # new lines
|
|
159
|
-
end
|
|
160
|
-
|
|
161
|
-
def split_at_dot(string)
|
|
162
|
-
re = /([a-z]{2})[\.\?!] ?/i # regex to match *aa. where a is any letter
|
|
163
|
-
string.split(re).each_slice(2).map(&:join)
|
|
164
|
-
end
|
|
165
|
-
end
|
|
166
6
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: answerific
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.1
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Justin Domingue
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2015-04-
|
|
11
|
+
date: 2015-04-30 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -125,6 +125,7 @@ files:
|
|
|
125
125
|
- bin/console
|
|
126
126
|
- bin/setup
|
|
127
127
|
- lib/answerific.rb
|
|
128
|
+
- lib/answerific/miner.rb
|
|
128
129
|
- lib/answerific/version.rb
|
|
129
130
|
homepage: https://github.com/justindomingue/answerific
|
|
130
131
|
licenses: []
|