google-local-results-ai-parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: ee56950fd52aa6f87345bd757703be624ef3c24fd64b56e7bc62940d25af249d
4
+ data.tar.gz: 589f573aaf3f77042e97ab6271ba8ff8a29a0329fe5caa669aee49a4198df9c1
5
+ SHA512:
6
+ metadata.gz: 78479818c57185517acba5e74130c58a0ae0274d6612bb154043e4e9b2a5912a984ce47c10c8fdf10963f3df543db8142d94f1172c92ad58fc848dd2e29a661a
7
+ data.tar.gz: a9ad40b52b9d12786da0d8e4c4d78186b05e9000ce51e9a09968a36dd02970b18fd5d554543b3a22e00cefccba85826df1240fd42c3304985f9e0c17ebac7470
@@ -0,0 +1,391 @@
1
+ require 'nokolexbor'
2
+ require 'http'
3
+ require 'parallel'
4
+ require 'json'
5
+
6
+ module GoogleLocalResultsAiParser
7
# Endpoint used when the caller does not pass `server:`.
DEFAULT_SERVER = 'https://api-inference.huggingface.co/models/serpapi/bert-base-local-results'.freeze

# Extracted text is split into separately classified fragments on newlines and dot separators.
DEFAULT_SEPARATOR_REGEX = %r{\n|·|⋅}.freeze

# Selector for the nodes `parse` removes before extracting text.
DEFAULT_REJECTED_CSS = "[role='heading'], a[ping], [class*='label']".freeze

# Selector for bold nodes; `parse` replaces the parent of each match with one text node.
DEFAULT_BROKEN_CSS = 'b:has(::text)'.freeze

# Default `iteration:` value; `sort_results` loops over `0..iteration`.
DEFAULT_MAX_ITERATION = 1
12
+
13
# Raised by `sort_results` when the classification results cannot be read.
# The default message points at server connectivity / credentials.
class CustomError < StandardError
  def initialize(message = "\nThere is a problem with the connection to the server. Try setting up a private server or configure your server credentials.\nIf you are using the public endpoint, you may wait for the model to load.")
    super(message)
    @message = message
  end

  # Returns the message exactly as given to the constructor.
  def message
    @message
  end

  # Prefixes the message with the error class name.
  def to_s
    format('%s: %s', self.class, message)
  end
end
25
+
26
+ class << self
27
# Runs `parse` over several HTML fragments via `Parallel.map`, one thread per fragment.
#
# html_parts - Array of HTML strings (nil or a single string are tolerated).
# All remaining keyword arguments are forwarded unchanged to `parse`.
#
# Returns the Array produced by `Parallel.map` (one `parse` result per fragment),
# or an empty Array when there is nothing to parse.
def parse_multiple(html_parts: nil, bearer_token: nil, server: DEFAULT_SERVER, separator_regex: DEFAULT_SEPARATOR_REGEX, rejected_css: DEFAULT_REJECTED_CSS, broken_css: DEFAULT_BROKEN_CSS, iteration: DEFAULT_MAX_ITERATION)
  parts = Array(html_parts)
  # Avoids calling `size` on nil and spinning up a zero-thread pool.
  return [] if parts.empty?

  Parallel.map(parts, in_threads: parts.size) do |html|
    # Forward the caller's `broken_css` and `iteration`; the defaults were
    # previously passed here, silently discarding both arguments.
    parse(
      html: html,
      bearer_token: bearer_token,
      server: server,
      separator_regex: separator_regex,
      rejected_css: rejected_css,
      broken_css: broken_css,
      iteration: iteration
    )
  end
end
32
+
33
# Classifies the text fragments of one HTML snippet and returns a Hash of
# label => text.
#
# html            - HTML string handed to Nokolexbor.
# bearer_token    - token passed through to the classification requests.
# server          - classification endpoint URL.
# separator_regex - pattern used to split the extracted text into fragments.
# rejected_css    - selector of nodes removed before extraction.
# broken_css      - selector of nodes whose parent is flattened to plain text.
# iteration       - upper bound of the `0..iteration` correction loop in `sort_results`.
def parse(html: nil, bearer_token: nil, server: DEFAULT_SERVER, separator_regex: DEFAULT_SEPARATOR_REGEX, rejected_css: DEFAULT_REJECTED_CSS, broken_css: DEFAULT_BROKEN_CSS, iteration: DEFAULT_MAX_ITERATION)
  doc = Nokolexbor::HTML(html)

  # Rejecting title, buttons, and label
  doc.css(rejected_css).remove

  # Breaking down bold text to reduce noise. Uses the caller-supplied selector;
  # the default constant was previously hard-coded here, so `broken_css:` had no effect.
  doc.css(broken_css).each { |b| b.parent.replace(Nokolexbor::Text.new(b.parent.text, doc)) }

  # Separating and cleaning the text
  unsplit_text = doc.at_css('html').text
  extracted_text = doc.css("::text").map { |part| part.text.strip }.join("\n")
  cleaned_text = extracted_text.split(separator_regex).map(&:strip).reject(&:empty?)

  # Making parallel requests to server for classification
  results = parallel_post_requests(server, bearer_token, cleaned_text)

  # After-fix and sorting of results
  results = sort_results(results, extracted_text, unsplit_text, iteration, doc)
  transform_hash(results, unsplit_text)
end
56
+
57
# Collapses the classified fragments into a Hash of label => text.
#
# results      - Array of `{ input: String, result: [[{"label" => ..., "score" => ...}, ...]] }`;
#                the first prediction of each entry decides its label.
# unsplit_text - the document text the fragments were cut from.
#
# When a label occurs more than once, the stored text becomes the span of
# `unsplit_text` running from the text collected so far to the new fragment.
# Returns the Hash.
def transform_hash(results, unsplit_text)
  results.each_with_object({}) do |result, final_results|
    label = result[:result][0][0]["label"]
    value = result[:input]
    collected = final_results[label]

    final_results[label] =
      if collected
        # Fragments are literal text, so they must be escaped: "(1.4K)" or "C++"
        # would otherwise be read as regex syntax (wrong match or RegexpError).
        span = /#{Regexp.escape(collected.to_s)}.*#{Regexp.escape(value.to_s)}/
        # `.` does not cross newlines, so the span may not exist in `unsplit_text`;
        # fall back to joining the pieces instead of storing nil and losing the text.
        unsplit_text[span] || "#{collected} #{value}"
      else
        # Directly assign values
        value
      end
  end
end
75
+
76
# Applies the after-corrections to the classification results and returns them.
#
# The loop body runs once per value of `0..iteration`, stopping early as soon as
# no same-label neighbours remain. Each pass:
#   1. reads the top label of every result (raising CustomError if that fails),
#   2. runs the button-text safety check and the known-confusion fixes,
#   3. for every pair of same-label results that appear back to back in
#      `unsplit_text`, demotes the top prediction of the lower-scoring one.
def sort_results(results, extracted_text, unsplit_text, iteration, doc)
  # Known clashes, applied in this order.
  known_clash_fixes = %i[
    service_options_as_type_confusion
    description_as_hours_confusion
    description_as_type_confusion
    reviews_as_rating_confusion
    button_text_as_hours_confusion
  ]

  (0..iteration).each do |_pass|
    label_order =
      begin
        results.map { |entry| entry[:result][0][0]["label"] }
      rescue
        raise CustomError
      end

    # Safety measures
    results, label_order = check_if_button_text(results, label_order, doc)

    # Find duplicates
    duplicates = find_duplicates(label_order)

    known_clash_fixes.each do |fix|
      results, label_order, duplicates = send(fix, results, label_order, duplicates)
    end

    # General clashes: one list of index pairs per duplicate group.
    check_if_on_different_lines(results, duplicates, unsplit_text).each do |clashes|
      clashes.each do |first_index, second_index|
        first_score = results[first_index][:result][0][0]["score"]
        second_score = results[second_index][:result][0][0]["score"]
        loser_index = first_score > second_score ? second_index : first_index

        # Zero out the false classification, and put it to last position
        predictions = results[loser_index][:result][0]
        demoted = predictions[0]
        demoted["score"] = 0.0
        predictions[0] = predictions[1]
        predictions.delete_at(1)
        predictions << demoted
      end
    end

    # Check one more time to see if there's any clashes left
    remaining_labels = results.map { |entry| entry[:result][0][0]["label"] }
    remaining_clashes = check_if_on_different_lines(results, find_duplicates(remaining_labels), unsplit_text)
    break if remaining_clashes.all?(&:empty?)
  end

  results
end
138
+
139
# For every duplicate group, lists the consecutive index pairs whose inputs,
# concatenated with nothing in between, occur verbatim in `unsplit_text`.
#
# Returns one Array of `[i, j]` pairs per group (empty when the group has no such pair).
def check_if_on_different_lines(results, duplicates, unsplit_text)
  duplicates.map do |group|
    group.each_cons(2).select do |pair|
      glued = pair.map { |index| results[index][:input].to_s }.join
      unsplit_text.include?(glued)
    end
  end
end
158
+
159
# Groups the positions of labels that occur more than once.
#
# Returns an Array of index Arrays, one per repeated label, ordered by the
# label's first occurrence; labels that appear once are left out.
def find_duplicates(label_order)
  positions_by_label = label_order.each_index.group_by { |index| label_order[index] }
  positions_by_label.values.select { |positions| positions.size > 1 }
end
176
+
177
# Double checking residue button text.
# The model hasn't encountered this behaviour; this is a safety measure.
#
# Every result whose top label is "button text" is checked against the text of
# the `[href]` / `[ping]` elements of `doc`:
#   * text found inside such an element -> the result is removed;
#   * otherwise -> the "button text" prediction is zeroed and moved to the end,
#     so the runner-up prediction (with its own score) becomes the label.
#
# Returns `results` and `label_order` with the removed entries dropped from both.
def check_if_button_text(results, label_order, doc)
  return results, label_order unless label_order.include?("button text")

  button_indices = label_order.each_index.select { |index| label_order[index] == "button text" }
  # The selector does not depend on the candidate, so query the document once.
  link_texts = doc.css("[href], [ping]").map(&:text)
  confirmed_buttons = []

  button_indices.each do |button_index|
    entry = results[button_index]
    predictions = entry[:result][0]
    inside_link = link_texts.any? { |text| text.include?(entry[:input]) }

    if inside_link || predictions.size < 2
      # A real button, or nothing to fall back to: drop the entry so that
      # `results` and `label_order` stay aligned.
      confirmed_buttons << entry
    else
      # Take the "button text" prediction out BEFORE promoting the runner-up.
      # `entry` and `results[button_index]` are the same object, so zeroing
      # "the first prediction" after the promotion used to hit the runner-up
      # and append it a second time, losing the button-text prediction.
      rejected = predictions.shift
      rejected["score"] = 0.0
      predictions << rejected
      label_order[button_index] = predictions[0]["label"]
    end
  end

  # Clear the buttons
  confirmed_buttons.each { |entry| results.delete(entry) }

  # Clear the labels
  label_order.delete_if { |label| label == "button text" }

  return results, label_order
end
214
+
215
# Removes known button captions ("Expand more") that were classified as `hours`.
#
# Acts only when a known caption is present AND it belongs to the duplicated
# `hours` group; otherwise everything is returned untouched. On removal the
# entry is deleted from `results` and `label_order`, taken out of its duplicate
# group (the group is dropped if one index is left), and every remaining
# duplicate index above the removed position is shifted down by one so the
# groups keep pointing at the right results.
#
# Returns `results`, `label_order`, `duplicates`.
def button_text_as_hours_confusion(results, label_order, duplicates)
  known_errors = ["Expand more"]
  return results, label_order, duplicates unless results.any? { |result| known_errors.include?(result[:input]) }

  hours_group = duplicates.find { |group| results[group[0]][:result][0][0]["label"] == "hours" }
  # The caption may be present without `hours` being duplicated; nothing to fix then.
  return results, label_order, duplicates unless hours_group

  # Only delete an entry that really is a known caption (searching from the end
  # of the group), never a genuine `hours` value.
  stray_index = hours_group.reverse_each.find { |index| known_errors.include?(results[index][:input]) }
  return results, label_order, duplicates unless stray_index

  # Delete the known button text directly
  results.delete_at(stray_index)

  # Rearranging `label_order`
  label_order.delete_at(stray_index)

  # Rearranging duplicates
  hours_group.delete(stray_index)
  duplicates.delete_if { |group| group.equal?(hours_group) } if hours_group.size == 1

  # `delete_at` moved every later result one position to the left.
  duplicates.each do |group|
    group.map! { |index| index > stray_index ? index - 1 : index }
  end

  return results, label_order, duplicates
end
242
+
243
# 3.4 .. (1.4K)
# Fixes `(1.4K)`
#
# When `rating` is duplicated and the last entry of that group looks like
# "(<digits>.<digits><word char>)", that entry is relabelled as `reviews`:
# its first prediction becomes reviews/1.0, the existing reviews prediction is
# removed, and rating/0.0 is appended. `label_order` and `duplicates` are
# updated to match. Returns `results`, `label_order`, `duplicates`.
def reviews_as_rating_confusion(results, label_order, duplicates)
  rating_group = duplicates.find { |group| results[group[0]][:result][0][0]["label"] == "rating" }
  return results, label_order, duplicates unless rating_group

  target = rating_group[-1]
  return results, label_order, duplicates unless results[target][:input].match?(/\(\d+\.\d+\w\)/)

  # Zero out the `rating`, and put it to last position
  predictions = results[target][:result][0]
  reviews_index = predictions.index(predictions.find { |prediction| prediction["label"] == "reviews" })
  predictions[0] = {"label" => "reviews", "score" => 1.0}
  predictions.delete_at(reviews_index)
  predictions << {"label" => "rating", "score" => 0.0}

  # Rearranging `label_order`
  label_order[target] = "reviews"

  # Rearranging duplicates: drop the relabelled index, then the group if it no longer repeats
  rating_group.delete(rating_group[-1])
  duplicates.delete(rating_group) if rating_group.size == 1

  return results, label_order, duplicates
end
274
+
275
# Coffee shop ... Iconic Seattle-based coffeehouse chain
# Fixes `Iconic Seattle-based coffeehouse chain`
#
# When the last label is `type`, the last result is relabelled as `description`:
# its first prediction becomes description/1.0, the existing description
# prediction is removed, and type/0.0 is appended. `label_order` and
# `duplicates` are updated to match. Returns `results`, `label_order`, `duplicates`.
def description_as_type_confusion(results, label_order, duplicates)
  return results, label_order, duplicates unless label_order[-1] == "type"

  # Zero out the `type`, and put it to last position
  predictions = results[-1][:result][0]
  description_index = predictions.index(predictions.find { |prediction| prediction["label"] == "description" })
  predictions[0] = {"label" => "description", "score" => 1.0}
  predictions.delete_at(description_index)
  predictions << {"label" => "type", "score" => 0.0}

  # Rearranging `label_order`
  label_order[-1] = "description"

  # Rearranging duplicates if there's any duplication
  last_position = label_order.size - 1
  type_group = duplicates.find { |group| group.include?(last_position) }
  if type_group
    type_group.delete(type_group[-1])
    duplicates.delete(type_group) if type_group.size == 1
  end

  return results, label_order, duplicates
end
303
+
304
# Drive through: Open ⋅ Closes 12 AM
# Fixes `Closes 12 AM`
#
# When the first `description` label is immediately followed by the first
# `hours` label, that `hours` result is relabelled as `description`: its first
# prediction becomes description/1.0, an existing description prediction is
# removed, and hours/0.0 is appended. `label_order` and `duplicates` are updated
# to match. Returns `results`, `label_order`, `duplicates`.
def description_as_hours_confusion(results, label_order, duplicates)
  description_index = label_order.index("description")
  hours_index = label_order.index("hours")

  # Description may or may not be a duplicate.
  # This is a known error from the model, so it has to be handled in any case.
  adjacent = description_index && hours_index && description_index + 1 == hours_index
  return results, label_order, duplicates unless adjacent

  # Zero out the `hours`, and put it to last position
  predictions = results[hours_index][:result][0]
  alternative_index = predictions.index { |prediction| prediction["label"] == "description" }
  predictions[0] = {"label" => "description", "score" => 1.0}
  # The model may not have returned a description prediction at all;
  # `delete_at(nil)` would raise a TypeError.
  predictions.delete_at(alternative_index) if alternative_index && alternative_index > 0
  predictions << {"label" => "hours", "score" => 0.0}

  # Rearranging `label_order`
  label_order[hours_index] = "description"

  # Rearranging duplicates if there's any duplication.
  # `hours_index` is the FIRST hours position, so it is that index that leaves
  # the group. Removing the group's last index instead (as before) kept the
  # relabelled entry and dropped a genuine `hours` one.
  hours_group = duplicates.find { |group| group.include?(hours_index) }
  if hours_group
    hours_group.delete(hours_index)
    duplicates.delete_if { |group| group.equal?(hours_group) } if hours_group.size == 1
  end

  return results, label_order, duplicates
end
337
+
338
# Takeaway ⋅ Dine-in ...
# Fixes `Takeaway`
#
# When `type` is duplicated and the result right after the last entry of that
# group is labelled `service options`, the last `type` entry is relabelled as
# `service options`: its first prediction becomes service options/1.0, an
# existing service-options prediction is removed, and type/0.0 is appended.
# `label_order` and `duplicates` are updated to match.
# Returns `results`, `label_order`, `duplicates`.
def service_options_as_type_confusion(results, label_order, duplicates)
  type_group = duplicates.find { |group| results[group[0]][:result][0][0]["label"] == "type" }
  return results, label_order, duplicates unless type_group

  target = type_group[-1]
  adjacent_item = results[target + 1]
  unless adjacent_item && adjacent_item[:result][0][0]["label"] == "service options"
    return results, label_order, duplicates
  end

  # Zero out the `type`, and put it to last position
  predictions = results[target][:result][0]
  alternative_index = predictions.index { |prediction| prediction["label"] == "service options" }
  predictions[0] = {"label" => "service options", "score" => 1.0}
  # The model may not have returned a service-options prediction at all;
  # `delete_at(nil)` would raise a TypeError.
  predictions.delete_at(alternative_index) if alternative_index && alternative_index > 0
  predictions << {"label" => "type", "score" => 0.0}

  # Rearranging `label_order`. The label is spelled with a space everywhere else;
  # "service_options" made `label_order` disagree with the predictions.
  label_order[target] = "service options"

  # Rearranging duplicates
  type_group.delete(target)
  duplicates.delete_if { |group| group.equal?(type_group) } if type_group.size == 1

  return results, label_order, duplicates
end
369
+
370
+ private
371
+
372
# Sends one classification request per input via `Parallel.map`, using as many
# threads as there are inputs, and returns the collected `post_request` results.
def parallel_post_requests(server, bearer_token, inputs)
  Parallel.map(inputs, in_threads: inputs.size) { |text| post_request(server, bearer_token, text) }
end
379
+
380
# Posts one text fragment to the classification server.
#
# server       - endpoint URL string.
# bearer_token - token for the Authorization header; nil or empty sends no such header.
# input        - the text to classify, sent as `{"inputs": input}`.
#
# Returns `{ input: input, result: <parsed JSON response body> }`.
# JSON::ParserError propagates when the body is not valid JSON.
def post_request(server, bearer_token, input)
  url = URI.parse(server)

  headers = { 'Content-Type' => 'application/json' }
  # Without a token the header used to be sent as "Bearer " with nothing after it;
  # leave it out instead so token-less servers receive a plain request.
  token = bearer_token.to_s
  headers['Authorization'] = "Bearer #{token}" unless token.empty?

  response = HTTP.headers(headers).post(url, body: { inputs: input }.to_json)

  { input: input, result: JSON.parse(response.body.to_s) }
end
390
+ end
391
+ end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: google-local-results-ai-parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Emirhan Akdeniz
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2023-06-13 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A gem to be used with serpapi/bert-base-local-results model to predict
14
+ different parts of Google Local Listings.
15
+ email: kagermanovtalks@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/google-local-results-ai-parser.rb
21
+ homepage: https://github.com/serpapi/google-local-results-ai-parser
22
+ licenses:
23
+ - MIT
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubygems_version: 3.1.4
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: A gem to be used with serpapi/bert-base-local-results model to predict different
44
+ parts of Google Local Listings.
45
+ test_files: []