google-local-results-ai-parser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/google-local-results-ai-parser.rb +391 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: ee56950fd52aa6f87345bd757703be624ef3c24fd64b56e7bc62940d25af249d
|
4
|
+
data.tar.gz: 589f573aaf3f77042e97ab6271ba8ff8a29a0329fe5caa669aee49a4198df9c1
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 78479818c57185517acba5e74130c58a0ae0274d6612bb154043e4e9b2a5912a984ce47c10c8fdf10963f3df543db8142d94f1172c92ad58fc848dd2e29a661a
|
7
|
+
data.tar.gz: a9ad40b52b9d12786da0d8e4c4d78186b05e9000ce51e9a09968a36dd02970b18fd5d554543b3a22e00cefccba85826df1240fd42c3304985f9e0c17ebac7470
|
@@ -0,0 +1,391 @@
|
|
1
|
+
require 'nokolexbor'
|
2
|
+
require 'http'
|
3
|
+
require 'parallel'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
module GoogleLocalResultsAiParser
|
7
|
+
# Public HuggingFace inference endpoint for the `serpapi/bert-base-local-results` model.
DEFAULT_SERVER = 'https://api-inference.huggingface.co/models/serpapi/bert-base-local-results'.freeze
# Splits extracted text into classifiable fragments: newlines plus the
# "·"/"⋅" separators Google renders between local-result fields.
DEFAULT_SEPARATOR_REGEX = /\n|·|⋅/.freeze
# CSS for nodes stripped before text extraction: headings, pinged links
# (buttons), and label-like elements.
DEFAULT_REJECTED_CSS = "[role='heading'], a[ping], [class*='label']".freeze
# CSS for bold fragments that get flattened into their parent's plain text.
DEFAULT_BROKEN_CSS = "b:has(::text)".freeze
# Number of extra after-correction passes performed by `sort_results`.
DEFAULT_MAX_ITERATION = 1
|
12
|
+
|
13
|
+
# Raised when the classification server cannot be reached or returns a
# malformed payload (e.g. the public model is still loading).
class CustomError < StandardError
  # Human-readable explanation carried by this error.
  attr_reader :message

  # @param message [String] override for the default connectivity hint
  def initialize(message = "\nThere is a problem with the connection to the server. Try setting up a private server or configure your server credentials.\nIf you are using the public endpoint, you may wait for the model to load.")
    @message = message
    super
  end

  # Renders as "<ClassName>: <message>".
  def to_s
    format('%s: %s', self.class, message)
  end
end
|
25
|
+
|
26
|
+
class << self
|
27
|
+
# Parses many HTML fragments concurrently, one thread per fragment.
#
# @param html_parts [Array<String>] raw HTML fragments, one per local result
# @param bearer_token [String, nil] HuggingFace API token
# @param server [String] inference endpoint URL
# @param separator_regex [Regexp] splits extracted text into fragments
# @param rejected_css [String] CSS of nodes removed before extraction
# @param broken_css [String] CSS of bold nodes flattened into parent text
# @param iteration [Integer] extra correction passes for `sort_results`
# @return [Array<Hash>] one label => text hash per input fragment
def parse_multiple(html_parts: nil, bearer_token: nil, server: DEFAULT_SERVER, separator_regex: DEFAULT_SEPARATOR_REGEX, rejected_css: DEFAULT_REJECTED_CSS, broken_css: DEFAULT_BROKEN_CSS, iteration: DEFAULT_MAX_ITERATION)
  Parallel.map(html_parts, in_threads: html_parts.size) do |html|
    # Fix: forward the caller's `broken_css` and `iteration` values —
    # previously the defaults were hard-coded here, silently ignoring
    # whatever the caller passed in.
    parse(html: html, bearer_token: bearer_token, server: server, separator_regex: separator_regex, rejected_css: rejected_css, broken_css: broken_css, iteration: iteration)
  end
end
|
32
|
+
|
33
|
+
# Parses a single Google Local result HTML fragment into a labelled hash.
#
# Pipeline: strip rejected nodes -> flatten bold text -> extract and split
# text -> classify each fragment via the server -> fix known model
# confusions -> fold into a label => text hash.
#
# @param html [String] raw HTML of one local result
# @param bearer_token [String, nil] HuggingFace API token
# @param server [String] inference endpoint URL
# @param separator_regex [Regexp] splits extracted text into fragments
# @param rejected_css [String] CSS of nodes removed before extraction
# @param broken_css [String] CSS of bold nodes flattened into parent text
# @param iteration [Integer] extra correction passes for `sort_results`
# @return [Hash] label => text
def parse(html: nil, bearer_token: nil, server: DEFAULT_SERVER, separator_regex: DEFAULT_SEPARATOR_REGEX, rejected_css: DEFAULT_REJECTED_CSS, broken_css: DEFAULT_BROKEN_CSS, iteration: DEFAULT_MAX_ITERATION)
  doc = Nokolexbor::HTML(html)

  # Rejecting title, buttons, and label
  doc.css(rejected_css).remove

  # Breaking down bold text to reduce noise.
  # Fix: honor the `broken_css` argument — it was hard-coded to
  # DEFAULT_BROKEN_CSS, silently ignoring the caller's value.
  doc.css(broken_css).each { |b| b.parent.replace(Nokolexbor::Text.new(b.parent.text, doc)) }

  # Separating and cleaning the text
  unsplit_text = doc.at_css('html').text
  extracted_text = doc.css("::text").map { |part| part.text.strip }.compact.join("\n")
  # (Removed a no-op `.flatten` — `split` already yields a flat string array.)
  cleaned_text = extracted_text.split(separator_regex).map(&:strip).reject(&:empty?)

  # Making parallel requests to server for classification
  results = parallel_post_requests(server, bearer_token, cleaned_text)

  # After-fix and sorting of results
  results = sort_results(results, extracted_text, unsplit_text, iteration, doc)
  transform_hash(results, unsplit_text)
end
|
56
|
+
|
57
|
+
# Folds classified fragments into a label => text hash.
#
# When the same label appears more than once, the two values are merged by
# locating the span of `unsplit_text` that runs from the first value to the
# second (preserving the original separators between them).
#
# @param results [Array<Hash>] items shaped `{input: String, result: [[{"label"=>..., "score"=>...}, ...]]}`
# @param unsplit_text [String] full text of the result, separators intact
# @return [Hash{String => String}] label => combined text
def transform_hash(results, unsplit_text)
  final_results = {}
  results.each do |result|
    label = result[:result][0][0]["label"]
    value = result[:input]

    if final_results[label]
      # Combine the text for same elements.
      # Fix: escape both fragments before interpolating into the regex —
      # inputs such as "(1.4K)" contain metacharacters that previously
      # corrupted the match (the parentheses were treated as groups).
      combined = unsplit_text[/#{Regexp.escape(final_results[label])}.+#{Regexp.escape(value)}/]
      # Robustness: if the span cannot be located, join with a space
      # instead of storing nil.
      final_results[label] = combined || "#{final_results[label]} #{value}"
    else
      # Directly assign values
      final_results[label] = value
    end
  end

  final_results
end
|
75
|
+
|
76
|
+
# Applies after-corrections to the raw classification results, in place.
#
# Each pass: drop/relabel residual button text, find duplicate labels, fix
# the known model confusions, then resolve remaining same-line clashes by
# demoting the lower-scoring prediction. Stops early once no clashes remain.
#
# @param results [Array<Hash>] items shaped `{input: String, result: [[prediction, ...]]}`
# @param extracted_text [String] newline-joined extracted text (currently unused here)
# @param unsplit_text [String] full text of the result, separators intact
# @param iteration [Integer] extra passes beyond the first (loop runs 0..iteration)
# @param doc [Nokolexbor::Document] parsed HTML, used for button-text checks
# @return [Array<Hash>] the corrected results
# @raise [CustomError] when the server payload lacks the expected structure
def sort_results(results, extracted_text, unsplit_text, iteration, doc)
  # Make at most 2 iterations for after-corrections
  (0..iteration).each do |i|
    begin
      # Check if some results contain clashes, or need to be merged.
      # A malformed server response (e.g. model still loading) raises here.
      label_order = results.map {|result| result[:result][0][0]["label"]}
    rescue
      raise CustomError
    end

    # Safety measures
    results, label_order = check_if_button_text(results, label_order, doc)

    # Find duplicates
    duplicates = find_duplicates(label_order)

    # Known clashes
    results, label_order, duplicates = service_options_as_type_confusion(results, label_order, duplicates)
    results, label_order, duplicates = description_as_hours_confusion(results, label_order, duplicates)
    results, label_order, duplicates = description_as_type_confusion(results, label_order, duplicates)
    results, label_order, duplicates = reviews_as_rating_confusion(results, label_order, duplicates)
    results, label_order, duplicates = button_text_as_hours_confusion(results, label_order, duplicates)

    # General clashes: duplicate-labelled results whose inputs sit on the
    # same line of `unsplit_text` — one of them must be mislabelled.
    line_result = check_if_on_different_lines(results, duplicates, unsplit_text)
    duplicates.each_with_index do |duplicate, duplicate_index|
      if line_result[duplicate_index] != []
        # General clash
        line_result[duplicate_index].each do |clash|
          first_result_score = results[clash[0]][:result][0][0]["score"]
          second_result_score = results[clash[1]][:result][0][0]["score"]

          # Keep the higher-scoring prediction; demote the other.
          if first_result_score > second_result_score
            clash_index = clash[1]
          else
            clash_index = clash[0]
          end

          # Zero out the false classification, and put it to last position
          primary_prediction = results[clash_index][:result][0][0]
          primary_prediction["score"] = 0.0
          second_prediction = results[clash_index][:result][0][1]
          results[clash_index][:result][0][0] = second_prediction
          results[clash_index][:result][0].delete_at(1)
          results[clash_index][:result][0] << primary_prediction
        end
      end
    end

    # Check one more time to see if there's any clashes left
    label_order = results.map {|result| result[:result][0][0]["label"]}
    duplicates = find_duplicates(label_order)
    line_result = check_if_on_different_lines(results, duplicates, unsplit_text)
    no_clashes = line_result.all? { |sub_array| sub_array.empty? }

    # Early exit: no remaining same-line clashes, further passes are no-ops.
    if no_clashes
      break
    end
  end

  results
end
|
138
|
+
|
139
|
+
# Items on different lines will be combined in `unsplit_text`.
# We can make combinations of 2 to eliminate the bad weed.
#
# For each duplicate group, collects the adjacent index pairs whose inputs
# appear back-to-back in `unsplit_text` (i.e. sit on the same line and
# therefore clash).
#
# @param results [Array<Hash>] items with an `:input` string
# @param duplicates [Array<Array<Integer>>] grouped indices of duplicate labels
# @param unsplit_text [String] full text of the result, separators intact
# @return [Array<Array<Array(Integer, Integer)>>] clashing pairs per group
def check_if_on_different_lines(results, duplicates, unsplit_text)
  duplicates.map do |duplicate_group|
    duplicate_group.each_cons(2).select do |index_pair|
      joined_inputs = index_pair.map { |result_index| results[result_index][:input] }.join
      unsplit_text.include?(joined_inputs)
    end
  end
end
|
158
|
+
|
159
|
+
# Find duplicate labels and group them.
#
# Returns one group per repeated label: the first occurrence's index
# followed by the indices of every later occurrence. Indices already
# claimed by an earlier group are not re-grouped.
#
# @param label_order [Array<String>] top label per result, in order
# @return [Array<Array<Integer>>] grouped indices of duplicate labels
def find_duplicates(label_order)
  groups = []
  label_order.each_with_index do |label, index|
    # Skip indices that an earlier group already covers.
    already_grouped = groups.flatten.include?(index)
    matching_indices = (0...label_order.size).select do |other_index|
      label_order[other_index] == label && other_index != index && !already_grouped
    end

    groups << [index, *matching_indices] unless matching_indices.empty?
  end

  groups
end
|
176
|
+
|
177
|
+
# Double checking residue button text
# The model hasn't encountered this behaviour.
# This is a safety measure.
#
# Results labelled "button text" are verified against the document: if the
# text really lives inside a link ([href]/[ping]) the result is dropped
# entirely; otherwise its "button text" prediction is demoted and the next
# best label takes over.
#
# @param results [Array<Hash>] classification results (mutated in place)
# @param label_order [Array<String>] top label per result (mutated in place)
# @param doc [Nokolexbor::Document] parsed HTML used to verify link text
# @return [Array(Array, Array)] the updated results and label_order
def check_if_button_text(results, label_order, doc)
  return results, label_order unless label_order.include?("button text")

  button_indices = label_order.map.with_index {|label, index| index if label == "button text"}.compact
  button_results = []

  button_indices.each do |button_index|
    button_result = results[button_index]
    button_text = results[button_index][:input]
    has_button_text = doc.css("[href], [ping]").any? {|element| element.text.include?(button_text)}

    if has_button_text
      # If it is really a button text inside a link
      button_results << button_result
    else
      # Zero out the `button text`, and put it to last position.
      # NOTE(review): `button_result` aliases `results[button_index]`, so
      # after the first two lines `button_result[:result][0][0]` is the
      # *promoted* second prediction — its score is zeroed and it is also
      # appended, leaving a duplicate reference at the end. Presumably the
      # intent was to demote the original "button text" prediction; confirm.
      results[button_index][:result][0][0] = results[button_index][:result][0][1]
      results[button_index][:result][0].delete_at(1)
      button_result[:result][0][0]["score"] = 0.0
      results[button_index][:result][0] << button_result[:result][0][0]
      label_order[button_index] = results[button_index][:result][0][0]["label"]
    end
  end

  # Clear the buttons
  button_results.each do |button_result|
    results.delete(button_result)
  end

  # Clear the labels
  label_order.delete_if {|label| label == "button text"}

  return results, label_order
end
|
214
|
+
|
215
|
+
# Fixes known button captions (e.g. "Expand more") that the model
# misclassifies as `hours`: the offending entry is removed outright from
# the results, the label order, and its duplicate group.
#
# @param results [Array<Hash>] classification results (mutated in place)
# @param label_order [Array<String>] top label per result (mutated in place)
# @param duplicates [Array<Array<Integer>>] duplicate groups (mutated in place)
# @return [Array(Array, Array, Array)] updated results, label_order, duplicates
def button_text_as_hours_confusion(results, label_order, duplicates)
  known_errors = ["Expand more"]
  confusion_condition = results.any? { |result| known_errors.include?(result[:input]) }
  return results, label_order, duplicates unless confusion_condition

  hours_duplicate = duplicates.find do |duplicate|
    results[duplicate[0]][:result][0][0]["label"] == "hours"
  end

  # Fix: a known button caption can be present without any `hours`
  # duplicate group; previously `hours_duplicate[-1]` raised NoMethodError
  # on nil in that case.
  return results, label_order, duplicates unless hours_duplicate

  # Delete the known button text directly
  results.delete_at(hours_duplicate[-1])

  # Rearranging `label_order`
  label_order.delete_at(hours_duplicate[-1])

  # Rearranging duplicates
  last_item = duplicates[duplicates.index(hours_duplicate)][-1]
  duplicates[duplicates.index(hours_duplicate)].delete(last_item)

  # A group with a single remaining index is no longer a duplicate.
  if (duplicate_arr = duplicates[duplicates.index(hours_duplicate)]) && duplicate_arr.size == 1
    duplicates.delete(duplicate_arr)
  end

  return results, label_order, duplicates
end
|
242
|
+
|
243
|
+
# 3.4 .. (1.4K)
# Fixes `(1.4K)`
#
# When two results share the `rating` label and the later one looks like a
# parenthesised review count (e.g. "(1.4K)"), it is relabelled `reviews`.
#
# @param results [Array<Hash>] classification results (mutated in place)
# @param label_order [Array<String>] top label per result (mutated in place)
# @param duplicates [Array<Array<Integer>>] duplicate groups (mutated in place)
# @return [Array(Array, Array, Array)] updated results, label_order, duplicates
def reviews_as_rating_confusion(results, label_order, duplicates)
  # `find` yields the first group whose leading result is labelled "rating"
  # (the block returning a truthy index acts as the predicate).
  rating_duplicate = duplicates.find.with_index do |duplicate, duplicate_index|
    if results[duplicate[0]][:result][0][0]["label"] == "rating"
      duplicate_index
    end
  end

  if rating_duplicate && results[rating_duplicate[-1]][:input][/\(\d+\.\d+\w\)/]
    # Zero out the `rating`, and put it to last position.
    # NOTE(review): if the prediction list lacks a "reviews" entry,
    # `reviews_hash` is nil and `delete_at(index(nil))` would raise —
    # presumably the model scores every label; confirm.
    reviews_hash = results[rating_duplicate[-1]][:result][0].find {|hash| hash["label"] == "reviews" }
    reviews_index = results[rating_duplicate[-1]][:result][0].index(reviews_hash)
    results[rating_duplicate[-1]][:result][0][0] = {"label" => "reviews", "score" => 1.0}
    results[rating_duplicate[-1]][:result][0].delete_at(reviews_index)
    results[rating_duplicate[-1]][:result][0] << {"label" => "rating", "score" => 0.0}

    # Rearranging `label_order`
    label_order[rating_duplicate[-1]] = "reviews"

    # Rearranging duplicates
    last_item = duplicates[duplicates.index(rating_duplicate)][-1]
    duplicates[duplicates.index(rating_duplicate)].delete(last_item)

    # A group with a single remaining index is no longer a duplicate.
    if (duplicate_arr = duplicates[duplicates.index(rating_duplicate)]) && duplicate_arr.size == 1
      duplicates.delete(duplicate_arr)
    end
  end

  return results, label_order, duplicates
end
|
274
|
+
|
275
|
+
# Coffee shop ... Iconic Seattle-based coffeehouse chain
# Fixes `Iconic Seattle-based coffeehouse chain`
#
# When the very last result is labelled `type`, it is taken to actually be
# a free-text description and relabelled accordingly.
#
# @param results [Array<Hash>] classification results (mutated in place)
# @param label_order [Array<String>] top label per result (mutated in place)
# @param duplicates [Array<Array<Integer>>] duplicate groups (mutated in place)
# @return [Array(Array, Array, Array)] updated results, label_order, duplicates
def description_as_type_confusion(results, label_order, duplicates)
  # Only applies when the final classification is `type`.
  return results, label_order, duplicates if label_order[-1] != "type"

  # Zero out the `type`, and put it to last position.
  # NOTE(review): if the prediction list lacks a "description" entry,
  # `description_index` is nil and `delete_at(nil)` raises — presumably the
  # model scores every label; confirm.
  description_hash = results[-1][:result][0].find {|hash| hash["label"] == "description" }
  description_index = results[-1][:result][0].index(description_hash)
  results[-1][:result][0][0] = {"label" => "description", "score" => 1.0}
  results[-1][:result][0].delete_at(description_index)
  results[-1][:result][0] << {"label" => "type", "score" => 0.0}

  # Rearranging `label_order`
  label_order[-1] = "description"

  # Rearranging duplicates if there's any duplication
  if duplicates.flatten.include?(label_order.size - 1)
    type_duplicate = duplicates.find {|duplicate| duplicate.include?(label_order.size - 1)}
    last_item = duplicates[duplicates.index(type_duplicate)][-1]
    duplicates[duplicates.index(type_duplicate)].delete(last_item)

    # A group with a single remaining index is no longer a duplicate.
    if (duplicate_arr = duplicates[duplicates.index(type_duplicate)]) && duplicate_arr.size == 1
      duplicates.delete(duplicate_arr)
    end
  end

  return results, label_order, duplicates
end
|
303
|
+
|
304
|
+
# Drive through: Open ⋅ Closes 12 AM
# Fixes `Closes 12 AM``
#
# When an `hours` result immediately follows a `description` result, the
# `hours` entry is part of the description (a known model error) and is
# relabelled.
#
# @param results [Array<Hash>] classification results (mutated in place)
# @param label_order [Array<String>] top label per result (mutated in place)
# @param duplicates [Array<Array<Integer>>] duplicate groups (mutated in place)
# @return [Array(Array, Array, Array)] updated results, label_order, duplicates
def description_as_hours_confusion(results, label_order, duplicates)
  description_index = label_order.index("description")
  hours_index = label_order.index("hours")

  # Description may or may not be a duplicate.
  # This is a known error from the model, so it has to be handled in any case.
  if description_index && hours_index && description_index + 1 == hours_index
    # Zero out the `hours`, and put it to last position.
    # NOTE(review): if the prediction list lacks a "description" entry,
    # `description_index` is nil here and `delete_at(nil)` raises —
    # presumably the model scores every label; confirm.
    description_hash = results[hours_index][:result][0].find {|hash| hash["label"] == "description" }
    description_index = results[hours_index][:result][0].index(description_hash)
    results[hours_index][:result][0][0] = {"label" => "description", "score" => 1.0}
    results[hours_index][:result][0].delete_at(description_index)
    results[hours_index][:result][0] << {"label" => "hours", "score" => 0.0}

    # Rearranging `label_order`
    label_order[hours_index] = "description"

    # Rearranging duplicates if there's any duplication
    if duplicates.flatten.include?(hours_index)
      hours_duplicate = duplicates.find {|duplicate| duplicate.include?(hours_index)}
      last_item = duplicates[duplicates.index(hours_duplicate)][-1]
      duplicates[duplicates.index(hours_duplicate)].delete(last_item)

      # A group with a single remaining index is no longer a duplicate.
      if (duplicate_arr = duplicates[duplicates.index(hours_duplicate)]) && duplicate_arr.size == 1
        duplicates.delete(duplicate_arr)
      end
    end
  end

  return results, label_order, duplicates
end
|
337
|
+
|
338
|
+
# Takeaway ⋅ Dine-in ...
# Fixes `Takeaway`
#
# When a `type`-labelled duplicate is immediately followed by a
# `service options` result, that duplicate is actually a service option
# (e.g. "Takeaway"): demote its `type` prediction and relabel it.
#
# @param results [Array<Hash>] classification results (mutated in place)
# @param label_order [Array<String>] top label per result (mutated in place)
# @param duplicates [Array<Array<Integer>>] duplicate groups (mutated in place)
# @return [Array(Array, Array, Array)] updated results, label_order, duplicates
def service_options_as_type_confusion(results, label_order, duplicates)
  type_duplicate = duplicates.find do |duplicate|
    results[duplicate[0]][:result][0][0]["label"] == "type"
  end

  if type_duplicate && (adjacent_item = results[type_duplicate[-1] + 1]) && adjacent_item[:result][0][0]["label"] == "service options"
    # Zero out the `type`, and put it to last position
    service_options_hash = results[type_duplicate[-1]][:result][0].find {|hash| hash["label"] == "service options" }
    service_options_index = results[type_duplicate[-1]][:result][0].index(service_options_hash)
    results[type_duplicate[-1]][:result][0][0] = {"label" => "service options", "score" => 1.0}
    # Robustness: only drop the old entry when it was actually present.
    results[type_duplicate[-1]][:result][0].delete_at(service_options_index) if service_options_index
    results[type_duplicate[-1]][:result][0] << {"label" => "type", "score" => 0.0}

    # Rearranging `label_order`.
    # Fix: use "service options" (with a space) — the label string the model
    # emits and every other comparison in this file uses. The previous
    # "service_options" (underscore) value could never match in later
    # duplicate checks.
    label_order[type_duplicate[-1]] = "service options"

    # Rearranging duplicates
    last_item = duplicates[duplicates.index(type_duplicate)][-1]
    duplicates[duplicates.index(type_duplicate)].delete(last_item)

    # A group with a single remaining index is no longer a duplicate.
    if (duplicate_arr = duplicates[duplicates.index(type_duplicate)]) && duplicate_arr.size == 1
      duplicates.delete(duplicate_arr)
    end
  end

  return results, label_order, duplicates
end
|
369
|
+
|
370
|
+
private
|
371
|
+
|
372
|
+
# Classifies every input fragment concurrently, one thread per fragment.
#
# @param server [String] inference endpoint URL
# @param bearer_token [String, nil] HuggingFace API token
# @param inputs [Array<String>] text fragments to classify
# @return [Array<Hash>] `{input:, result:}` pairs, in input order
def parallel_post_requests(server, bearer_token, inputs)
  Parallel.map(inputs, in_threads: inputs.size) do |input|
    post_request(server, bearer_token, input)
  end
end
|
379
|
+
|
380
|
+
# Sends one classification request and pairs the input with the parsed result.
#
# @param server [String] inference endpoint URL
# @param bearer_token [String, nil] HuggingFace API token
# @param input [String] text fragment to classify
# @return [Hash] `{input: String, result: Object}` where result is the
#   JSON-decoded response body
def post_request(server, bearer_token, input)
  request_headers = {
    'Authorization' => "Bearer #{bearer_token}",
    'Content-Type' => 'application/json'
  }
  payload = { inputs: input }.to_json

  raw_response = HTTP.headers(request_headers).post(URI.parse(server), body: payload)

  { input: input, result: JSON.parse(raw_response.body) }
end
|
390
|
+
end
|
391
|
+
end
|
metadata
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: google-local-results-ai-parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Emirhan Akdeniz
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2023-06-13 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A gem to be used with serpapi/bert-base-local-results model to predict
|
14
|
+
different parts of Google Local Listings.
|
15
|
+
email: kagermanovtalks@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/google-local-results-ai-parser.rb
|
21
|
+
homepage: https://github.com/serpapi/google-local-results-ai-parser
|
22
|
+
licenses:
|
23
|
+
- MIT
|
24
|
+
metadata: {}
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubygems_version: 3.1.4
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: A gem to be used with serpapi/bert-base-local-results model to predict different
|
44
|
+
parts of Google Local Listings.
|
45
|
+
test_files: []
|