google-local-results-ai-parser 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: ee56950fd52aa6f87345bd757703be624ef3c24fd64b56e7bc62940d25af249d
4
+ data.tar.gz: 589f573aaf3f77042e97ab6271ba8ff8a29a0329fe5caa669aee49a4198df9c1
5
+ SHA512:
6
+ metadata.gz: 78479818c57185517acba5e74130c58a0ae0274d6612bb154043e4e9b2a5912a984ce47c10c8fdf10963f3df543db8142d94f1172c92ad58fc848dd2e29a661a
7
+ data.tar.gz: a9ad40b52b9d12786da0d8e4c4d78186b05e9000ce51e9a09968a36dd02970b18fd5d554543b3a22e00cefccba85826df1240fd42c3304985f9e0c17ebac7470
@@ -0,0 +1,391 @@
1
+ require 'nokolexbor'
2
+ require 'http'
3
+ require 'parallel'
4
+ require 'json'
5
+
6
+ module GoogleLocalResultsAiParser
7
# Endpoint used when the caller does not pass `server:`.
DEFAULT_SERVER = 'https://api-inference.huggingface.co/models/serpapi/bert-base-local-results'.freeze

# Pattern the extracted text is split on when the caller does not pass `separator_regex:`.
DEFAULT_SEPARATOR_REGEX = /\n|·|⋅/.freeze

# Selector of the nodes removed from the document when the caller does not pass `rejected_css:`.
DEFAULT_REJECTED_CSS = "[role='heading'], a[ping], [class*='label']".freeze

# Selector of the nodes whose parent is collapsed to plain text (default for `broken_css:`).
DEFAULT_BROKEN_CSS = 'b:has(::text)'.freeze

# Default for the `iteration:` option consumed by `sort_results`.
DEFAULT_MAX_ITERATION = 1
12
+
13
# Error whose default message points at a problem reaching the
# classification server.
#
# `message` returns the text exactly as given to the constructor, while
# `to_s` prefixes that text with the error's class name.
class CustomError < StandardError
  # @param message [String] text describing the failure
  def initialize(message = "\nThere is a problem with the connection to the server. Try setting up a private server or configure your server credentials.\nIf you are using the public endpoint, you may wait for the model to load.")
    super(message)
    @message = message
  end

  # @return [String] the message as passed to the constructor
  def message
    @message
  end

  # @return [String] the class name, a colon and the message
  def to_s
    format('%s: %s', self.class, message)
  end
end
25
+
26
+ class << self
27
# Parses several HTML fragments, running `parse` for each one through
# `Parallel.map` with one thread per fragment.
#
# Every option is forwarded to `parse` unchanged, including `broken_css`
# and `iteration`.
#
# @param html_parts [Array<String>, nil] HTML fragments to parse
# @param bearer_token [String, nil] token forwarded to `parse`
# @param server [String] classification endpoint forwarded to `parse`
# @param separator_regex [Regexp] forwarded to `parse`
# @param rejected_css [String] forwarded to `parse`
# @param broken_css [String] forwarded to `parse`
# @param iteration [Integer] forwarded to `parse`
# @return [Array] one `parse` result per fragment; `[]` when no fragment is given
def parse_multiple(html_parts: nil, bearer_token: nil, server: DEFAULT_SERVER, separator_regex: DEFAULT_SEPARATOR_REGEX, rejected_css: DEFAULT_REJECTED_CSS, broken_css: DEFAULT_BROKEN_CSS, iteration: DEFAULT_MAX_ITERATION)
  parts = Array(html_parts)
  # Nothing to do; also avoids calling `.size` on nil when the default is used.
  return [] if parts.empty?

  Parallel.map(parts, in_threads: parts.size) do |html|
    parse(
      html: html,
      bearer_token: bearer_token,
      server: server,
      separator_regex: separator_regex,
      rejected_css: rejected_css,
      broken_css: broken_css,
      iteration: iteration
    )
  end
end
32
+
33
# Parses one HTML fragment: cleans the document, splits its text into
# candidate lines, has every line classified by the server, resolves
# label clashes and returns the result of `transform_hash`.
#
# @param html [String] HTML fragment to parse
# @param bearer_token [String, nil] token passed on to the request helper
# @param server [String] classification endpoint
# @param separator_regex [Regexp] pattern the extracted text is split on
# @param rejected_css [String] selector of nodes removed before extraction
# @param broken_css [String] selector of nodes whose parent is replaced by
#   a single text node holding the parent's text
# @param iteration [Integer] passed on to `sort_results`
# @return [Hash] whatever `transform_hash` builds from the sorted results
def parse(html: nil, bearer_token: nil, server: DEFAULT_SERVER, separator_regex: DEFAULT_SEPARATOR_REGEX, rejected_css: DEFAULT_REJECTED_CSS, broken_css: DEFAULT_BROKEN_CSS, iteration: DEFAULT_MAX_ITERATION)
  doc = Nokolexbor::HTML(html)

  # Rejecting title, buttons, and label
  doc.css(rejected_css).remove

  # Breaking down bold text to reduce noise. Uses the caller's selector
  # rather than always falling back to the default one.
  doc.css(broken_css).each do |bold|
    container = bold.parent
    container.replace(Nokolexbor::Text.new(container.text, doc))
  end

  # Separating and cleaning the text
  unsplit_text = doc.at_css('html').text
  extracted_text = doc.css("::text").map { |part| part.text.strip }.join("\n")
  cleaned_text = extracted_text.split(separator_regex).map(&:strip).reject(&:empty?)

  # Making parallel requests to server for classification
  results = parallel_post_requests(server, bearer_token, cleaned_text)

  # After-fix and sorting of results
  results = sort_results(results, extracted_text, unsplit_text, iteration, doc)
  transform_hash(results, unsplit_text)
end
56
+
57
# Collapses the classified lines into a `label => text` hash.
#
# The first line seen for a label is stored as-is. When a later line has
# the same label, the stored text becomes the span of `unsplit_text` that
# starts with the stored text and ends with the new line. Both pieces are
# escaped before being put into the pattern, so characters such as `(`,
# `+` or `.` are matched literally. If no such span exists the two pieces
# are joined with a single space instead of dropping the label's value.
#
# @param results [Array<Hash>] items with `:input` (String) and `:result`,
#   where `result[:result][0][0]["label"]` is the chosen label
# @param unsplit_text [String] text the combined span is searched in
# @return [Hash{String => String}] text per label
def transform_hash(results, unsplit_text)
  results.each_with_object({}) do |result, final_results|
    label = result[:result][0][0]["label"]
    value = result[:input]
    existing = final_results[label]

    if existing
      # Combine the text for same elements
      span = /#{Regexp.escape(existing)}.+#{Regexp.escape(value)}/
      final_results[label] = unsplit_text[span] || "#{existing} #{value}"
    else
      # Directly assign values
      final_results[label] = value
    end
  end
end
75
+
76
# Runs the after-correction passes over the classified lines.
#
# Each pass drops or relabels residue button text, applies the known
# label-confusion fixes, and then resolves the remaining clashes reported
# by `check_if_on_different_lines`: of the two clashing lines, the one
# with the lower top score has its top prediction zeroed and moved to the
# end of its list, so its next candidate becomes the label. A line with
# only one candidate is left untouched because there is nothing to
# promote. Passes stop early once no clash is left.
#
# @param results [Array<Hash>] items with `:input` and `:result`
# @param extracted_text [String] accepted for interface compatibility; not read here
# @param unsplit_text [String] text used for clash detection
# @param iteration [Integer] the loop runs over `0..iteration`
# @param doc [Object] parsed document handed to `check_if_button_text`
# @return [Array<Hash>] the corrected results
# @raise [CustomError] when a result does not have the expected
#   `[:result][0][0]["label"]` shape
def sort_results(results, extracted_text, unsplit_text, iteration, doc)
  # Reads the winning label of every item; any failure here means the
  # server did not return a list of predictions.
  top_labels = lambda do |items|
    begin
      items.map { |item| item[:result][0][0]["label"] }
    rescue StandardError
      raise CustomError
    end
  end

  (0..iteration).each do
    # Check if some results contain clashes, or need to be merged
    label_order = top_labels.call(results)

    # Safety measures
    results, label_order = check_if_button_text(results, label_order, doc)

    # Find duplicates
    duplicates = find_duplicates(label_order)

    # Known clashes
    results, label_order, duplicates = service_options_as_type_confusion(results, label_order, duplicates)
    results, label_order, duplicates = description_as_hours_confusion(results, label_order, duplicates)
    results, label_order, duplicates = description_as_type_confusion(results, label_order, duplicates)
    results, label_order, duplicates = reviews_as_rating_confusion(results, label_order, duplicates)
    results, label_order, duplicates = button_text_as_hours_confusion(results, label_order, duplicates)

    # General clashes
    check_if_on_different_lines(results, duplicates, unsplit_text).each do |clashes|
      clashes.each do |first_index, second_index|
        first_score = results[first_index][:result][0][0]["score"]
        second_score = results[second_index][:result][0][0]["score"]
        loser_index = first_score > second_score ? second_index : first_index

        predictions = results[loser_index][:result][0]
        # Without a second candidate there is nothing to promote.
        next if predictions.size < 2

        # Zero out the false classification, and put it to last position
        demoted = predictions.shift
        demoted["score"] = 0.0
        predictions << demoted
      end
    end

    # Check one more time to see if there's any clashes left
    remaining = find_duplicates(top_labels.call(results))
    break if check_if_on_different_lines(results, remaining, unsplit_text).all?(&:empty?)
  end

  results
end
138
+
139
# Reports, for every group of same-label indices, which neighbouring pairs
# clash.
#
# Neighbouring pairs are taken in group order. A pair clashes when the two
# inputs, glued together without a separator, occur in `unsplit_text`.
#
# @param results [Array<Hash>] items whose `:input` is read
# @param duplicates [Array<Array<Integer>>] groups of indices into `results`
# @param unsplit_text [String] text searched for the glued inputs
# @return [Array<Array<Array<Integer>>>] one list of clashing pairs per group
def check_if_on_different_lines(results, duplicates, unsplit_text)
  duplicates.map do |group|
    group.each_cons(2).select do |pair|
      glued = pair.map { |position| results[position][:input].to_s }.join
      unsplit_text.include?(glued)
    end
  end
end
158
+
159
# Groups the positions of labels that occur more than once.
#
# Groups are ordered by the first occurrence of their label and list
# positions in ascending order. Labels that occur once produce no group.
# A single `group_by` pass replaces the previous nested rescans.
#
# @param label_order [Array] labels, one per result
# @return [Array<Array<Integer>>] index groups, e.g. `[[0, 2, 5], [1, 4]]`
def find_duplicates(label_order)
  label_order.each_index
             .group_by { |index| label_order[index] }
             .values
             .select { |positions| positions.size > 1 }
end
176
+
177
# Double-checks every line labelled "button text" against the document.
#
# A line whose text occurs inside an element matched by `[href], [ping]`
# is treated as a real button: it is removed from `results` and from
# `label_order`. Otherwise its "button text" prediction is zeroed, moved
# to the end of the candidate list, and the next candidate becomes the
# label. A line with no other candidate is left unchanged.
#
# Entries are removed by position, so `results` and `label_order` stay
# aligned. Both arrays are modified in place.
#
# @param results [Array<Hash>] items with `:input` and `:result`
# @param label_order [Array<String>] top label per item
# @param doc [#css] parsed document; `css` must return elements responding to `text`
# @return [Array(Array<Hash>, Array<String>)] the updated `results` and `label_order`
def check_if_button_text(results, label_order, doc)
  return results, label_order unless label_order.include?("button text")

  button_indices = label_order.each_index.select { |index| label_order[index] == "button text" }
  link_elements = doc.css("[href], [ping]")
  confirmed_buttons = []

  button_indices.each do |button_index|
    button_text = results[button_index][:input]
    predictions = results[button_index][:result][0]

    if link_elements.any? { |element| element.text.include?(button_text) }
      # If it is really a button text inside a link
      confirmed_buttons << button_index
    elsif predictions.size > 1
      # Zero out the `button text`, and put it to last position. The
      # demoted entry is taken off the list before the promotion so the
      # promoted candidate keeps its own score.
      demoted = predictions.shift
      demoted["score"] = 0.0
      predictions << demoted
      label_order[button_index] = predictions[0]["label"]
    end
  end

  # Clear the buttons and their labels, highest index first so earlier
  # positions stay valid.
  confirmed_buttons.reverse_each do |button_index|
    results.delete_at(button_index)
    label_order.delete_at(button_index)
  end

  return results, label_order
end
214
+
215
# Removes a known button caption ("Expand more") that was classified as
# "hours" alongside real hours lines.
#
# Nothing happens unless some line's text is a known caption and a group
# of duplicated "hours" lines contains such a line. The caption is then
# deleted from `results` and `label_order`, taken out of its group (the
# group is dropped when one entry is left), and every index in
# `duplicates` that pointed past the deleted position is shifted down by
# one so the groups keep addressing the same lines.
#
# All three arrays are modified in place.
#
# @param results [Array<Hash>] items with `:input` and `:result`
# @param label_order [Array<String>] top label per item
# @param duplicates [Array<Array<Integer>>] groups of indices sharing a label
# @return [Array(Array, Array, Array)] `results`, `label_order`, `duplicates`
def button_text_as_hours_confusion(results, label_order, duplicates)
  known_errors = ["Expand more"]
  return results, label_order, duplicates unless results.any? { |result| known_errors.include?(result[:input]) }

  hours_duplicate = duplicates.find { |group| results[group[0]][:result][0][0]["label"] == "hours" }
  return results, label_order, duplicates unless hours_duplicate

  # Only a line that really is a known caption may be deleted; search from
  # the end of the group.
  removed_index = hours_duplicate.reverse_each.find { |index| known_errors.include?(results[index][:input]) }
  return results, label_order, duplicates unless removed_index

  # Delete the known button text directly
  results.delete_at(removed_index)

  # Rearranging `label_order`
  label_order.delete_at(removed_index)

  # Rearranging duplicates
  hours_duplicate.delete(removed_index)
  duplicates.delete(hours_duplicate) if hours_duplicate.size == 1

  # Positions after the deleted line moved up by one.
  duplicates.each do |group|
    group.map! { |index| index > removed_index ? index - 1 : index }
  end

  return results, label_order, duplicates
end
242
+
243
# 3.4 .. (1.4K)
# Fixes `(1.4K)`
#
# When "rating" is duplicated and the last line of that group looks like
# `(1.4K)`, that line is relabelled "reviews": its top prediction becomes
# `reviews` with score 1.0, any existing `reviews` candidate is dropped,
# and a `rating` candidate with score 0.0 is appended. The line is also
# taken out of the rating group, and the group is dropped when a single
# entry is left. A missing `reviews` candidate is tolerated.
#
# All three arrays are modified in place.
#
# @param results [Array<Hash>] items with `:input` and `:result`
# @param label_order [Array<String>] top label per item
# @param duplicates [Array<Array<Integer>>] groups of indices sharing a label
# @return [Array(Array, Array, Array)] `results`, `label_order`, `duplicates`
def reviews_as_rating_confusion(results, label_order, duplicates)
  rating_duplicate = duplicates.find { |group| results[group[0]][:result][0][0]["label"] == "rating" }
  return results, label_order, duplicates unless rating_duplicate

  target_index = rating_duplicate[-1]
  return results, label_order, duplicates unless results[target_index][:input][/\(\d+\.\d+\w\)/]

  # Zero out the `rating`, and put it to last position
  predictions = results[target_index][:result][0]
  reviews_position = predictions.index { |prediction| prediction["label"] == "reviews" }
  predictions[0] = { "label" => "reviews", "score" => 1.0 }
  predictions.delete_at(reviews_position) if reviews_position
  predictions << { "label" => "rating", "score" => 0.0 }

  # Rearranging `label_order`
  label_order[target_index] = "reviews"

  # Rearranging duplicates
  rating_duplicate.delete(target_index)
  duplicates.delete(rating_duplicate) if rating_duplicate.size == 1

  return results, label_order, duplicates
end
274
+
275
# Coffee shop ... Iconic Seattle-based coffeehouse chain
# Fixes `Iconic Seattle-based coffeehouse chain`
#
# When the last line is labelled "type" it is relabelled "description":
# its top prediction becomes `description` with score 1.0, any existing
# `description` candidate is dropped, and a `type` candidate with score
# 0.0 is appended. If the line belonged to a duplicate group it is taken
# out of it, and the group is dropped when a single entry is left. A
# missing `description` candidate is tolerated.
#
# All three arrays are modified in place.
#
# @param results [Array<Hash>] items with `:input` and `:result`
# @param label_order [Array<String>] top label per item
# @param duplicates [Array<Array<Integer>>] groups of indices sharing a label
# @return [Array(Array, Array, Array)] `results`, `label_order`, `duplicates`
def description_as_type_confusion(results, label_order, duplicates)
  return results, label_order, duplicates if label_order[-1] != "type"

  # Zero out the `type`, and put it to last position
  predictions = results[-1][:result][0]
  description_position = predictions.index { |prediction| prediction["label"] == "description" }
  predictions[0] = { "label" => "description", "score" => 1.0 }
  predictions.delete_at(description_position) if description_position
  predictions << { "label" => "type", "score" => 0.0 }

  # Rearranging `label_order`
  label_order[-1] = "description"

  # Rearranging duplicates if there's any duplication
  last_index = label_order.size - 1
  type_duplicate = duplicates.find { |group| group.include?(last_index) }
  if type_duplicate
    type_duplicate.delete(last_index)
    duplicates.delete(type_duplicate) if type_duplicate.size == 1
  end

  return results, label_order, duplicates
end
303
+
304
# Drive through: Open ⋅ Closes 12 AM
# Fixes `Closes 12 AM`
#
# When the first "description" line is immediately followed by the first
# "hours" line, that hours line is relabelled "description": its top
# prediction becomes `description` with score 1.0, any existing
# `description` candidate is dropped, and an `hours` candidate with score
# 0.0 is appended. If the relabelled line belonged to a duplicate group,
# that line itself is taken out of the group, and the group is dropped
# when a single entry is left. A missing `description` candidate is
# tolerated.
#
# All three arrays are modified in place.
#
# @param results [Array<Hash>] items with `:input` and `:result`
# @param label_order [Array<String>] top label per item
# @param duplicates [Array<Array<Integer>>] groups of indices sharing a label
# @return [Array(Array, Array, Array)] `results`, `label_order`, `duplicates`
def description_as_hours_confusion(results, label_order, duplicates)
  description_index = label_order.index("description")
  hours_index = label_order.index("hours")

  # Description may or may not be a duplicate.
  # This is a known error from the model, so it has to be handled in any case.
  adjacent = description_index && hours_index && description_index + 1 == hours_index
  return results, label_order, duplicates unless adjacent

  # Zero out the `hours`, and put it to last position
  predictions = results[hours_index][:result][0]
  description_position = predictions.index { |prediction| prediction["label"] == "description" }
  predictions[0] = { "label" => "description", "score" => 1.0 }
  predictions.delete_at(description_position) if description_position
  predictions << { "label" => "hours", "score" => 0.0 }

  # Rearranging `label_order`
  label_order[hours_index] = "description"

  # Rearranging duplicates if there's any duplication. The relabelled
  # line is the first "hours" line, so it is removed by its own index.
  hours_duplicate = duplicates.find { |group| group.include?(hours_index) }
  if hours_duplicate
    hours_duplicate.delete(hours_index)
    duplicates.delete(hours_duplicate) if hours_duplicate.size == 1
  end

  return results, label_order, duplicates
end
337
+
338
# Takeaway ⋅ Dine-in ...
# Fixes `Takeaway`
#
# When "type" is duplicated and the line right after the last line of
# that group is labelled "service options", the last type line is
# relabelled "service options": its top prediction becomes
# `service options` with score 1.0, any existing `service options`
# candidate is dropped, and a `type` candidate with score 0.0 is
# appended. `label_order` receives the same label text the predictions
# use. The line is taken out of the type group, and the group is dropped
# when a single entry is left. A missing `service options` candidate is
# tolerated.
#
# All three arrays are modified in place.
#
# @param results [Array<Hash>] items with `:input` and `:result`
# @param label_order [Array<String>] top label per item
# @param duplicates [Array<Array<Integer>>] groups of indices sharing a label
# @return [Array(Array, Array, Array)] `results`, `label_order`, `duplicates`
def service_options_as_type_confusion(results, label_order, duplicates)
  type_duplicate = duplicates.find { |group| results[group[0]][:result][0][0]["label"] == "type" }
  return results, label_order, duplicates unless type_duplicate

  target_index = type_duplicate[-1]
  adjacent_item = results[target_index + 1]
  follows_service_options = adjacent_item && adjacent_item[:result][0][0]["label"] == "service options"
  return results, label_order, duplicates unless follows_service_options

  # Zero out the `type`, and put it to last position
  predictions = results[target_index][:result][0]
  service_options_position = predictions.index { |prediction| prediction["label"] == "service options" }
  predictions[0] = { "label" => "service options", "score" => 1.0 }
  predictions.delete_at(service_options_position) if service_options_position
  predictions << { "label" => "type", "score" => 0.0 }

  # Rearranging `label_order`
  label_order[target_index] = "service options"

  # Rearranging duplicates
  type_duplicate.delete(target_index)
  duplicates.delete(type_duplicate) if type_duplicate.size == 1

  return results, label_order, duplicates
end
369
+
370
+ private
371
+
372
# Classifies every input through `post_request`, using `Parallel.map`
# with as many threads as there are inputs.
#
# @param server [String] endpoint handed to `post_request`
# @param bearer_token [String, nil] token handed to `post_request`
# @param inputs [Array<String>] text lines to classify
# @return [Array] what `Parallel.map` returns for the per-input requests
def parallel_post_requests(server, bearer_token, inputs)
  thread_count = inputs.size
  Parallel.map(inputs, in_threads: thread_count) { |text| post_request(server, bearer_token, text) }
end
379
+
380
# Posts one text line to the classification server as JSON and pairs the
# parsed response with the line that was sent.
#
# @param server [String] endpoint URL
# @param bearer_token [String, nil] value placed after `Bearer ` in the
#   `Authorization` header
# @param input [String] text sent under the `inputs` key
# @return [Hash] `{ input: <text>, result: <parsed JSON response body> }`
def post_request(server, bearer_token, input)
  endpoint = URI.parse(server)
  request_headers = {
    'Authorization' => "Bearer #{bearer_token}",
    'Content-Type' => 'application/json'
  }
  payload = { inputs: input }.to_json

  raw_response = HTTP.headers(request_headers).post(endpoint, body: payload)

  { input: input, result: JSON.parse(raw_response.body) }
end
390
+ end
391
+ end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: google-local-results-ai-parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Emirhan Akdeniz
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2023-06-13 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A gem to be used with serpapi/bert-base-local-results model to predict
14
+ different parts of Google Local Listings.
15
+ email: kagermanovtalks@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/google-local-results-ai-parser.rb
21
+ homepage: https://github.com/serpapi/google-local-results-ai-parser
22
+ licenses:
23
+ - MIT
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubygems_version: 3.1.4
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: A gem to be used with serpapi/bert-base-local-results model to predict different
44
+ parts of Google Local Listings.
45
+ test_files: []