google-local-results-ai-parser 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/google-local-results-ai-parser.rb +391 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: ee56950fd52aa6f87345bd757703be624ef3c24fd64b56e7bc62940d25af249d
|
4
|
+
data.tar.gz: 589f573aaf3f77042e97ab6271ba8ff8a29a0329fe5caa669aee49a4198df9c1
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 78479818c57185517acba5e74130c58a0ae0274d6612bb154043e4e9b2a5912a984ce47c10c8fdf10963f3df543db8142d94f1172c92ad58fc848dd2e29a661a
|
7
|
+
data.tar.gz: a9ad40b52b9d12786da0d8e4c4d78186b05e9000ce51e9a09968a36dd02970b18fd5d554543b3a22e00cefccba85826df1240fd42c3304985f9e0c17ebac7470
|
@@ -0,0 +1,391 @@
|
|
1
|
+
require 'nokolexbor'
|
2
|
+
require 'http'
|
3
|
+
require 'parallel'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
module GoogleLocalResultsAiParser
|
7
|
+
DEFAULT_SERVER = 'https://api-inference.huggingface.co/models/serpapi/bert-base-local-results'.freeze
|
8
|
+
DEFAULT_SEPARATOR_REGEX = /\n|·|⋅/.freeze
|
9
|
+
DEFAULT_REJECTED_CSS = "[role='heading'], a[ping], [class*='label']".freeze
|
10
|
+
DEFAULT_BROKEN_CSS = "b:has(::text)".freeze
|
11
|
+
DEFAULT_MAX_ITERATION = 1
|
12
|
+
|
13
|
+
# Raised when the classification server is unreachable or returns an
# unusable payload (e.g. the public Hugging Face model is still loading).
class CustomError < StandardError
  attr_reader :message

  # Default text nudges the user toward a private server or waiting for
  # the public model to finish loading.
  def initialize(message = "\nThere is a problem with the connection to the server. Try setting up a private server or configure your server credentials.\nIf you are using the public endpoint, you may wait for the model to load.")
    @message = message
    super(message)
  end

  # Prefix the message with the class name for clearer logs.
  def to_s
    format('%s: %s', self.class, message)
  end
end
|
25
|
+
|
26
|
+
class << self
|
27
|
+
# Parses multiple HTML snippets concurrently (one thread per snippet) and
# returns an array of parsed result hashes in the same order as `html_parts`.
#
# All keyword options are forwarded verbatim to `parse`.
def parse_multiple(html_parts: nil, bearer_token: nil, server: DEFAULT_SERVER, separator_regex: DEFAULT_SEPARATOR_REGEX, rejected_css: DEFAULT_REJECTED_CSS, broken_css: DEFAULT_BROKEN_CSS, iteration: DEFAULT_MAX_ITERATION)
  Parallel.map(html_parts, in_threads: html_parts.size) do |html|
    # Bug fix: forward the caller-supplied `broken_css` and `iteration`
    # (previously the DEFAULT_* constants were always passed, silently
    # ignoring those two options).
    parse(html: html, bearer_token: bearer_token, server: server, separator_regex: separator_regex, rejected_css: rejected_css, broken_css: broken_css, iteration: iteration)
  end
end
|
32
|
+
|
33
|
+
# Parses a single Google Local Results HTML snippet into a hash of
# { label => text } classifications.
#
# html:            raw HTML string of one local result
# bearer_token:    Hugging Face API token used for authorization
# server:          inference endpoint URL
# separator_regex: regex used to split the extracted text into parts
# rejected_css:    CSS for elements to drop (titles, buttons, labels)
# broken_css:      CSS for bold fragments to merge back into parent text
# iteration:       max number of after-correction passes in `sort_results`
def parse(html: nil, bearer_token: nil, server: DEFAULT_SERVER, separator_regex: DEFAULT_SEPARATOR_REGEX, rejected_css: DEFAULT_REJECTED_CSS, broken_css: DEFAULT_BROKEN_CSS, iteration: DEFAULT_MAX_ITERATION)
  doc = Nokolexbor::HTML(html)

  # Rejecting title, buttons, and label
  doc.css(rejected_css).remove

  # Breaking down bold text to reduce noise.
  # Bug fix: honor the caller-supplied `broken_css` (previously the
  # DEFAULT_BROKEN_CSS constant was always used, ignoring the parameter).
  doc.css(broken_css).each { |b| b.parent.replace(Nokolexbor::Text.new(b.parent.text, doc)) }

  # Separating and cleaning the text
  unsplit_text = doc.at_css('html').text
  extracted_text = doc.css("::text").map { |part| part.text.strip }.compact.join("\n")
  split_text = extracted_text.split(separator_regex)
  # `flatten` was a no-op on a flat array of strings; dropped.
  cleaned_text = split_text.map(&:strip).reject(&:empty?)

  # Making parallel requests to server for classification
  results = parallel_post_requests(server, bearer_token, cleaned_text)

  # After-fix and sorting of results
  results = sort_results(results, extracted_text, unsplit_text, iteration, doc)
  transform_hash(results, unsplit_text)
end
|
56
|
+
|
57
|
+
# Collapses per-part classification results into a { label => text } hash.
#
# results:      array of { input:, result: } hashes; the top prediction's
#               "label" is read from result[:result][0][0].
# unsplit_text: the full extracted text, used to recover the combined span
#               when a label occurs more than once.
def transform_hash(results, unsplit_text)
  final_results = {}
  results.each do |result|
    label = result[:result][0][0]["label"]
    value = result[:input]

    if final_results[label]
      # Combine the text for same elements.
      # Bug fix: inputs are literal text, so escape them before regex
      # interpolation — values such as "(1.4K)" previously formed an
      # unintended capture group and failed to match the raw text.
      final_results[label] = unsplit_text[/#{Regexp.escape(final_results[label])}.+#{Regexp.escape(value)}/]
    else
      # Directly assign values
      final_results[label] = value
    end
  end

  final_results
end
|
75
|
+
|
76
|
+
# After-correction pass over the raw classification results.
#
# Runs up to `iteration + 1` rounds: each round re-reads the top labels,
# removes residue button texts, applies fixes for known model confusions,
# then resolves remaining same-label clashes by keeping the higher-scored
# prediction and demoting the loser to the end of its prediction list.
# Stops early once a round ends with no clashes.
#
# NOTE(review): `extracted_text` is accepted but never used here — confirm
# whether it can be dropped from callers.
# Returns the (mutated in place) `results` array.
def sort_results(results, extracted_text, unsplit_text, iteration, doc)
  # Make at most 2 iterations for after-corrections
  (0..iteration).each do |i|
    begin
      # Check if some results contain clashes, or need to be merged.
      # A malformed server body (e.g. model still loading) raises here.
      label_order = results.map {|result| result[:result][0][0]["label"]}
    rescue
      raise CustomError
    end

    # Safety measures
    results, label_order = check_if_button_text(results, label_order, doc)

    # Find duplicates
    duplicates = find_duplicates(label_order)

    # Known clashes
    results, label_order, duplicates = service_options_as_type_confusion(results, label_order, duplicates)
    results, label_order, duplicates = description_as_hours_confusion(results, label_order, duplicates)
    results, label_order, duplicates = description_as_type_confusion(results, label_order, duplicates)
    results, label_order, duplicates = reviews_as_rating_confusion(results, label_order, duplicates)
    results, label_order, duplicates = button_text_as_hours_confusion(results, label_order, duplicates)

    # General clashes: duplicate groups whose inputs sit adjacent in the
    # raw (unsplit) text are treated as real clashes.
    line_result = check_if_on_different_lines(results, duplicates, unsplit_text)
    duplicates.each_with_index do |duplicate, duplicate_index|
      if line_result[duplicate_index] != []
        # General clash
        line_result[duplicate_index].each do |clash|
          first_result_score = results[clash[0]][:result][0][0]["score"]
          second_result_score = results[clash[1]][:result][0][0]["score"]

          # The lower-scored side of the pair is the false classification.
          if first_result_score > second_result_score
            clash_index = clash[1]
          else
            clash_index = clash[0]
          end

          # Zero out the false classification, and put it to last position
          primary_prediction = results[clash_index][:result][0][0]
          primary_prediction["score"] = 0.0
          second_prediction = results[clash_index][:result][0][1]
          results[clash_index][:result][0][0] = second_prediction
          results[clash_index][:result][0].delete_at(1)
          results[clash_index][:result][0] << primary_prediction
        end
      end
    end

    # Check one more time to see if there's any clashes left
    label_order = results.map {|result| result[:result][0][0]["label"]}
    duplicates = find_duplicates(label_order)
    line_result = check_if_on_different_lines(results, duplicates, unsplit_text)
    no_clashes = line_result.all? { |sub_array| sub_array.empty? }

    if no_clashes
      break
    end
  end

  results
end
|
138
|
+
|
139
|
+
# Items on different lines end up concatenated in `unsplit_text`, so for
# each duplicate group we test consecutive index pairs: a pair whose joined
# inputs occur verbatim in `unsplit_text` is flagged as a clash.
# Returns one array of clashing pairs per duplicate group.
def check_if_on_different_lines(results, duplicates, unsplit_text)
  duplicates.map do |duplicate_group|
    duplicate_group.each_cons(2).select do |pair|
      joined_inputs = pair.map { |idx| results[idx][:input] }.join
      unsplit_text.include?(joined_inputs)
    end
  end
end
|
158
|
+
|
159
|
+
# Find duplicate labels and group their indices.
# Each group is [first_index, other_index, ...]; an index already captured
# in an earlier group is not re-emitted as a group leader.
def find_duplicates(label_order)
  grouped = []
  label_order.each_with_index do |label, idx|
    matches = []
    label_order.each_with_index do |other_label, other_idx|
      next unless other_label == label
      next if other_idx == idx
      next if grouped.flatten.include?(idx)
      matches << other_idx
    end
    grouped << matches.unshift(idx) unless matches.empty?
  end
  grouped
end
|
176
|
+
|
177
|
+
# Double checking residue button text.
# The model hasn't encountered this behaviour, so this is a safety measure:
# an entry labeled "button text" is only trusted when its text really
# appears inside a link-like element ([href] / [ping]) of the document.
# Trusted button entries are removed entirely; untrusted ones are demoted
# to their second-best prediction.
#
# Returns the updated [results, label_order] pair (both mutated in place).
def check_if_button_text(results, label_order, doc)
  return results, label_order unless label_order.include?("button text")

  button_indices = label_order.map.with_index {|label, index| index if label == "button text"}.compact
  button_results = []

  button_indices.each do |button_index|
    button_result = results[button_index]
    button_text = results[button_index][:input]
    has_button_text = doc.css("[href], [ping]").any? {|element| element.text.include?(button_text)}

    if has_button_text
      # If it is really a button text inside a link
      button_results << button_result
    else
      # Zero out the `button text`, and put it to last position.
      # Bug fix: capture and zero the demoted `button text` prediction
      # BEFORE overwriting slot 0. Previously the promoted second
      # prediction was zeroed and appended (appearing twice) while the
      # original `button text` prediction was lost.
      # NOTE(review): assumes at least two predictions per entry — confirm.
      primary_prediction = results[button_index][:result][0][0]
      primary_prediction["score"] = 0.0
      results[button_index][:result][0][0] = results[button_index][:result][0][1]
      results[button_index][:result][0].delete_at(1)
      results[button_index][:result][0] << primary_prediction
      label_order[button_index] = results[button_index][:result][0][0]["label"]
    end
  end

  # Clear the buttons
  button_results.each do |button_result|
    results.delete(button_result)
  end

  # Clear the labels
  label_order.delete_if {|label| label == "button text"}

  return results, label_order
end
|
214
|
+
|
215
|
+
# "Expand more" is a known button text the model misclassifies as `hours`.
# When it is present and part of an `hours` duplicate group, the entry is
# deleted and `label_order`/`duplicates` are repaired accordingly.
#
# Returns [results, label_order, duplicates] (mutated in place).
def button_text_as_hours_confusion(results, label_order, duplicates)
  known_errors = ["Expand more"]
  confusion_condition = results.any? {|result| known_errors.include?(result[:input])}
  return results, label_order, duplicates unless confusion_condition

  hours_duplicate = duplicates.find do |duplicate|
    results[duplicate[0]][:result][0][0]["label"] == "hours"
  end

  # Robustness fix: the known button text may be present without any
  # `hours` duplicate group; previously `hours_duplicate[-1]` raised
  # NoMethodError on nil in that case.
  return results, label_order, duplicates unless hours_duplicate

  # Delete the known button text directly
  results.delete_at(hours_duplicate[-1])

  # Rearranging `label_order`
  label_order.delete_at(hours_duplicate[-1])

  # Rearranging duplicates; drop the group once only one member remains.
  hours_duplicate.delete(hours_duplicate[-1])
  duplicates.delete(hours_duplicate) if hours_duplicate.size == 1

  return results, label_order, duplicates
end
|
242
|
+
|
243
|
+
# 3.4 .. (1.4K)
# Fixes `(1.4K)` — a parenthesized review count misread as a second
# `rating`. The last member of a `rating` duplicate group whose input
# matches "(<d>.<d><letter>)" is re-labeled `reviews`.
#
# Returns [results, label_order, duplicates] (mutated in place).
def reviews_as_rating_confusion(results, label_order, duplicates)
  # Locate the duplicate group whose leader is labeled `rating`.
  rating_duplicate = duplicates.find.with_index do |duplicate, duplicate_index|
    if results[duplicate[0]][:result][0][0]["label"] == "rating"
      duplicate_index
    end
  end

  if rating_duplicate && results[rating_duplicate[-1]][:input][/\(\d+\.\d+\w\)/]
    # Zero out the `rating`, and put it to last position.
    # NOTE(review): assumes a "reviews" prediction exists in the list;
    # `find` returning nil would make `delete_at(nil)` raise — confirm.
    reviews_hash = results[rating_duplicate[-1]][:result][0].find {|hash| hash["label"] == "reviews" }
    reviews_index = results[rating_duplicate[-1]][:result][0].index(reviews_hash)
    results[rating_duplicate[-1]][:result][0][0] = {"label" => "reviews", "score" => 1.0}
    results[rating_duplicate[-1]][:result][0].delete_at(reviews_index)
    results[rating_duplicate[-1]][:result][0] << {"label" => "rating", "score" => 0.0}

    # Rearranging `label_order`
    label_order[rating_duplicate[-1]] = "reviews"

    # Rearranging duplicates: drop the re-labeled index from its group.
    last_item = duplicates[duplicates.index(rating_duplicate)][-1]
    duplicates[duplicates.index(rating_duplicate)].delete(last_item)

    # A group of one is no longer a duplicate; remove it.
    if (duplicate_arr = duplicates[duplicates.index(rating_duplicate)]) && duplicate_arr.size == 1
      duplicates.delete(duplicate_arr)
    end
  end

  return results, label_order, duplicates
end
|
274
|
+
|
275
|
+
# Coffee shop ... Iconic Seattle-based coffeehouse chain
# Fixes `Iconic Seattle-based coffeehouse chain`: when the LAST entry is
# labeled `type`, it is re-labeled `description` (the trailing position
# is treated as a tell), and the bookkeeping structures are repaired.
#
# Returns [results, label_order, duplicates] (mutated in place).
def description_as_type_confusion(results, label_order, duplicates)
  return results, label_order, duplicates if label_order[-1] != "type"

  # Zero out the `type`, and put it to last position.
  # NOTE(review): assumes a "description" prediction exists in the list;
  # `find` returning nil would make `delete_at(nil)` raise — confirm.
  description_hash = results[-1][:result][0].find {|hash| hash["label"] == "description" }
  description_index = results[-1][:result][0].index(description_hash)
  results[-1][:result][0][0] = {"label" => "description", "score" => 1.0}
  results[-1][:result][0].delete_at(description_index)
  results[-1][:result][0] << {"label" => "type", "score" => 0.0}

  # Rearranging `label_order`
  label_order[-1] = "description"

  # Rearranging duplicates if there's any duplication
  if duplicates.flatten.include?(label_order.size - 1)
    type_duplicate = duplicates.find {|duplicate| duplicate.include?(label_order.size - 1)}
    last_item = duplicates[duplicates.index(type_duplicate)][-1]
    duplicates[duplicates.index(type_duplicate)].delete(last_item)

    # A group of one is no longer a duplicate; remove it.
    if (duplicate_arr = duplicates[duplicates.index(type_duplicate)]) && duplicate_arr.size == 1
      duplicates.delete(duplicate_arr)
    end
  end

  return results, label_order, duplicates
end
|
303
|
+
|
304
|
+
# Drive through: Open ⋅ Closes 12 AM
# Fixes `Closes 12 AM`: an `hours` entry that immediately follows a
# `description` entry is re-labeled `description`.
#
# Returns [results, label_order, duplicates] (mutated in place).
def description_as_hours_confusion(results, label_order, duplicates)
  description_index = label_order.index("description")
  hours_index = label_order.index("hours")

  # Description may or may not be a duplicate.
  # This is a known error from the model, so it has to be handled in any case.
  if description_index && hours_index && description_index + 1 == hours_index
    # Zero out the `hours`, and put it to last position.
    # NOTE(review): assumes a "description" prediction exists in the list;
    # `find` returning nil would make `delete_at(nil)` raise — confirm.
    description_hash = results[hours_index][:result][0].find {|hash| hash["label"] == "description" }
    description_index = results[hours_index][:result][0].index(description_hash)
    results[hours_index][:result][0][0] = {"label" => "description", "score" => 1.0}
    results[hours_index][:result][0].delete_at(description_index)
    results[hours_index][:result][0] << {"label" => "hours", "score" => 0.0}

    # Rearranging `label_order`
    label_order[hours_index] = "description"

    # Rearranging duplicates if there's any duplication
    if duplicates.flatten.include?(hours_index)
      hours_duplicate = duplicates.find {|duplicate| duplicate.include?(hours_index)}
      last_item = duplicates[duplicates.index(hours_duplicate)][-1]
      duplicates[duplicates.index(hours_duplicate)].delete(last_item)

      # A group of one is no longer a duplicate; remove it.
      if (duplicate_arr = duplicates[duplicates.index(hours_duplicate)]) && duplicate_arr.size == 1
        duplicates.delete(duplicate_arr)
      end
    end
  end

  return results, label_order, duplicates
end
|
337
|
+
|
338
|
+
# Takeaway ⋅ Dine-in ...
# Fixes `Takeaway`: the last member of a `type` duplicate group that is
# immediately followed by a `service options` entry is itself re-labeled
# `service options`.
#
# Returns [results, label_order, duplicates] (mutated in place).
def service_options_as_type_confusion(results, label_order, duplicates)
  type_duplicate = duplicates.find.with_index do |duplicate, duplicate_index|
    if results[duplicate[0]][:result][0][0]["label"] == "type"
      duplicate_index
    end
  end

  if type_duplicate && (adjacent_item = results[type_duplicate[-1] + 1]) && adjacent_item[:result][0][0]["label"] == "service options"
    # Zero out the `type`, and put it to last position
    service_options_hash = results[type_duplicate[-1]][:result][0].find {|hash| hash["label"] == "service options" }
    service_options_index = results[type_duplicate[-1]][:result][0].index(service_options_hash)
    results[type_duplicate[-1]][:result][0][0] = {"label" => "service options", "score" => 1.0}
    results[type_duplicate[-1]][:result][0].delete_at(service_options_index)
    results[type_duplicate[-1]][:result][0] << {"label" => "type", "score" => 0.0}

    # Rearranging `label_order`.
    # Bug fix: use the canonical "service options" label (with a space);
    # the previous "service_options" spelling never matched the label
    # string used everywhere else in this module.
    label_order[type_duplicate[-1]] = "service options"

    # Rearranging duplicates: drop the re-labeled index from its group.
    last_item = duplicates[duplicates.index(type_duplicate)][-1]
    duplicates[duplicates.index(type_duplicate)].delete(last_item)

    # A group of one is no longer a duplicate; remove it.
    if (duplicate_arr = duplicates[duplicates.index(type_duplicate)]) && duplicate_arr.size == 1
      duplicates.delete(duplicate_arr)
    end
  end

  return results, label_order, duplicates
end
|
369
|
+
|
370
|
+
private
|
371
|
+
|
372
|
+
# Classifies every input string concurrently, one thread per input, and
# returns the server responses in input order.
def parallel_post_requests(server, bearer_token, inputs)
  Parallel.map(inputs, in_threads: inputs.size) do |input|
    post_request(server, bearer_token, input)
  end
end
|
379
|
+
|
380
|
+
# Sends one classification request to the inference endpoint and pairs the
# parsed JSON body with its input for later bookkeeping.
def post_request(server, bearer_token, input)
  endpoint = URI.parse(server)
  request_headers = {
    'Authorization' => "Bearer #{bearer_token}",
    'Content-Type' => 'application/json'
  }
  payload = { inputs: input }.to_json

  raw_response = HTTP.headers(request_headers).post(endpoint, body: payload)

  { input: input, result: JSON.parse(raw_response.body) }
end
|
390
|
+
end
|
391
|
+
end
|
metadata
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: google-local-results-ai-parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Emirhan Akdeniz
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2023-06-13 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A gem to be used with serpapi/bert-base-local-results model to predict
|
14
|
+
different parts of Google Local Listings.
|
15
|
+
email: kagermanovtalks@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/google-local-results-ai-parser.rb
|
21
|
+
homepage: https://github.com/serpapi/google-local-results-ai-parser
|
22
|
+
licenses:
|
23
|
+
- MIT
|
24
|
+
metadata: {}
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubygems_version: 3.1.4
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: A gem to be used with serpapi/bert-base-local-results model to predict different
|
44
|
+
parts of Google Local Listings.
|
45
|
+
test_files: []
|