aicmo-0.0.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aicmo-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,34 @@
+ Metadata-Version: 2.4
+ Name: aicmo
+ Version: 0.0.1
+ Summary: A package for using aicmo functions and tools
+ Author: Jayr Castro
+ Author-email: jayrcastro.py@gmail.com
+ Keywords: aicmo
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Operating System :: Unix
+ Classifier: Operating System :: MacOS :: MacOS X
+ Classifier: Operating System :: Microsoft :: Windows
+ Description-Content-Type: text/markdown
+ Requires-Dist: openai==1.75.0
+ Requires-Dist: scrapingbee==2.0.1
+ Requires-Dist: requests==2.32.3
+ Requires-Dist: boto3==1.37.37
+ Requires-Dist: tiktoken==0.9.0
+ Requires-Dist: opencv-python-headless==4.11.0.86
+ Requires-Dist: beautifulsoup4==4.13.4
+ Requires-Dist: numpy==2.2.4
+ Requires-Dist: python-dotenv==1.1.0
+ Requires-Dist: typesense==1.0.3
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: keywords
+ Dynamic: requires-dist
+ Dynamic: summary
+
+ A package for using aicmo functions and tools. It includes scraping and OpenAI helpers, with options for use in serverless applications such as AWS Lambda and GCP Cloud Functions.
aicmo-0.0.1/README.md ADDED
@@ -0,0 +1,3 @@
+ How to set up
+ 1. `pip install setuptools wheel twine`
+ 2. `python setup.py sdist bdist_wheel`
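
Note: the README installs twine but stops after building the distribution; assuming the standard PyPI flow, the final publish step would be `twine upload dist/*`.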
aicmo-0.0.1/aicmo/__init__.py ADDED
@@ -0,0 +1,865 @@
+ from openai import OpenAI, types
+ from scrapingbee import ScrapingBeeClient
+ from io import BytesIO
+ from bs4 import BeautifulSoup
+ from typing import Union, Any
+ from base64 import b64decode
+ from hashlib import sha256
+ from urllib.parse import urlencode
+ import numpy as np
+ import boto3, json, tiktoken, pickle, requests, cv2, re, hmac, os, typesense
+
+
+ class AICMOClient:
+     def __init__(
+         self,
+         aws_secret_name: str,
+         secret_dict: dict = None,
+         aws_access_key_id: str = None,
+         aws_secret_access_key: str = None,
+         aws_region_name: str = "us-east-1",
+         aws_s3_bucket: str = None,
+         tiktoken_encoding: str = "cl100k_base",
+         ts_host: str = None,
+         ts_port: int = None,
+         ts_api_key: str = None,
+     ) -> None:
+         """
+         Initialize the AICMOClient with AWS credentials and OpenAI model.
+         """
+         # Initialize AWS credentials and S3 bucket
+         aws_dict = {x: y for x, y in (("region_name", aws_region_name), ("aws_access_key_id", aws_access_key_id), ("aws_secret_access_key", aws_secret_access_key)) if y}
+         # Initialize secrets, either passed in directly or loaded from AWS Secrets Manager
+         if secret_dict:
+             self.secret_dict = secret_dict
+         else:
+             self.secret_dict = self.load_credentials_from_secrets_manager(aws_secret_name, aws_dict=aws_dict)
+         # Initialize AWS clients
+         self.s3_client = boto3.client('s3', **aws_dict)
+         self.stepfunction_client = boto3.client('stepfunctions', **aws_dict)
+         # Initialize the AWS S3 bucket, falling back to the secret's value
+         if aws_s3_bucket:
+             self.aws_s3_bucket = aws_s3_bucket
+         else:
+             self.aws_s3_bucket = self.secret_dict.get('AWS_S3_BUCKET', None)
+
+         # Initialize OpenAI client
+         openai_dict = {x: self.secret_dict[y] for x, y in (("api_key", "OPENAI_API_KEY"), ("organization", "OPENAI_ORG_KEY")) if self.secret_dict.get(y, None)}
+         self.openai_client = OpenAI(**openai_dict)
+
+         # Initialize ScrapingBee client
+         self.SCRAPINGBEE_API_KEY = self.secret_dict.get('SCRAPINGBEE_API_KEY', None)
+         self.scrapingbee_client = ScrapingBeeClient(api_key=self.SCRAPINGBEE_API_KEY)
+
+         # Initialize tiktoken encoder
+         self.tiktoken_client = tiktoken.get_encoding(tiktoken_encoding)
+
+         # Initialize the Typesense client only when a full node configuration is available
+         TS_HOST = self.secret_dict.get('TS_HOST', ts_host)
+         TS_PORT = self.secret_dict.get("TS_PORT", ts_port)
+         TS_API_KEY = self.secret_dict.get("TS_API_KEY", ts_api_key)
+         if TS_HOST and TS_PORT and TS_API_KEY:
+             self.ts_client = typesense.Client({
+                 'nodes': [
+                     {
+                         'host': TS_HOST,
+                         'port': TS_PORT,
+                         'protocol': self.secret_dict.get("TS_PROTOCOL", 'http')
+                     }
+                 ],
+                 'api_key': TS_API_KEY,
+                 "connection_timeout_seconds": self.secret_dict.get("TS_CONNECTION_TIMEOUT_SECONDS", 600)
+             })
+         else:
+             self.ts_client = None
+
+         # Initialize OpenAI model
+         self.OPENAI_MODEL = self.secret_dict.get('OPENAI_MODEL', None)
+
+         # Initialize per-API costing (stored as a JSON string in the secret)
+         self.COST = json.loads(self.secret_dict['COST'])
+
+     @staticmethod
+     def load_credentials_from_secrets_manager(
+         aws_secret_name: str,
+         aws_dict: dict = {}
+     ) -> dict:
+         """
+         Load credentials from AWS Secrets Manager.
+         """
+         try:
+             secretsmanager_client = boto3.client('secretsmanager', **aws_dict)
+             get_secret_value_response = secretsmanager_client.get_secret_value(SecretId=aws_secret_name)
+             return json.loads(get_secret_value_response['SecretString'])
+         except Exception as e:
+             raise RuntimeError(f"Failed to retrieve secrets: {e}")
+
+     def tools_call_gpt(
+         self,
+         messages: list,
+         tools: list,
+         tool_name: str,
+         tokens: dict = None,
+         model: str = None,
+         tries: int = 5
+     ) -> dict:
+         """
+         Call the GPT model with tools.
+         """
+         if not tokens:
+             tokens = self.get_empty_gpt_tokens()
+         if not model:
+             model = self.OPENAI_MODEL
+         ret_val = {
+             "completion": None,
+             "status": "failed",
+             "tokens": tokens
+         }
+         for _ in range(tries):
+             try:
+                 completion = self.openai_client.chat.completions.create(
+                     model=model,
+                     messages=messages,
+                     tools=tools,
+                     tool_choice={"type": "function", "function": {"name": tool_name}}
+                 )
+                 ret_val['completion'] = completion
+                 ret_val['status'] = "success"
+                 # Accumulate usage once; the original called get_gpt_tokens twice and double-counted
+                 ret_val['tokens'] = self.get_gpt_tokens(completion, model, tokens)
+                 return ret_val
+             except Exception as e:
+                 ret_val['errors'] = str(e)
+                 print(e)
+         return ret_val
+
+     def chat_completion_gpt(
+         self,
+         messages: list,
+         tokens: dict = None,
+         model: str = None,
+         temperature: float = 1,
+         tries: int = 3
+     ) -> dict:
+         """
+         Call the GPT model for chat completion.
+         """
+         if not tokens:
+             tokens = self.get_empty_gpt_tokens()
+         if not model:
+             model = self.OPENAI_MODEL
+         ret_val = {
+             "completion": None,
+             "status": "failed",
+             "tokens": tokens
+         }
+         for _ in range(tries):
+             try:
+                 completion = self.openai_client.chat.completions.create(
+                     model=model,
+                     messages=messages,
+                     temperature=temperature,
+                 )
+                 ret_val['completion'] = completion
+                 ret_val['status'] = "success"
+                 ret_val['tokens'] = self.get_gpt_tokens(completion, model, tokens)
+                 return ret_val
+             except Exception as e:
+                 ret_val['errors'] = str(e)
+                 print(e)
+         return ret_val
+
+     def get_empty_gpt_tokens(self) -> dict:
+         """
+         Initialize an empty dictionary for tokens.
+         """
+         tokens = {
+             "input_cost": 0,
+             "output_cost": 0,
+             "total_cost": 0,
+             "prompt_tokens": 0,
+             "completion_tokens": 0,
+             "total_tokens": 0
+         }
+         return tokens
+
+     def get_gpt_tokens(
+         self,
+         data: types.chat.chat_completion.ChatCompletion,
+         model: str,
+         tokens: dict
+     ) -> dict:
+         """
+         Calculate the token usage and cost.
+         """
+         prompt_tokens = data.usage.prompt_tokens
+         completion_tokens = data.usage.completion_tokens
+         # Per-token rates for this model, from the COST mapping loaded in __init__
+         cost = self.COST.get('openai', {}).get("texts", {}).get(model, {})
+         openai_input_cost = cost.get('input', None)
+         openai_output_cost = cost.get('output', None)
+         if openai_input_cost and openai_output_cost:
+             tokens['input_cost'] += prompt_tokens * openai_input_cost
+             tokens['output_cost'] += completion_tokens * openai_output_cost
+             tokens['total_cost'] = round(tokens['input_cost'] + tokens['output_cost'], 4)
+             tokens['prompt_tokens'] += prompt_tokens
+             tokens['completion_tokens'] += completion_tokens
+             tokens['total_tokens'] += prompt_tokens + completion_tokens
+             tokens['openai_input_cost'] = openai_input_cost
+             tokens['openai_output_cost'] = openai_output_cost
+         return tokens
+
+     def s3_upload_pickle(
+         self,
+         output: Any,
+         filename: str,
+         aws_s3_dir: str = None,
+         event_id: str = None,
+         sub_dir: str = None,
+         aws_s3_bucket: str = None,
+         **kwargs
+     ) -> str:
+         """
+         Pickle any object and upload it to S3.
+         """
+         if not aws_s3_bucket:
+             aws_s3_bucket = self.aws_s3_bucket
+         s3_key = os.path.join(*[x for x in (aws_s3_dir, event_id, sub_dir, filename) if x])
+         pickle_buffer = BytesIO()
+         pickle.dump(output, pickle_buffer)
+         pickle_buffer.seek(0)
+         self.s3_client.upload_fileobj(pickle_buffer, aws_s3_bucket, s3_key)
+         return f"https://{aws_s3_bucket}.s3.amazonaws.com/{s3_key}"
+
+     def s3_upload_image(
+         self,
+         filename: str,
+         aws_s3_dir: str = None,
+         event_id: str = None,
+         sub_dir: str = None,
+         filepath_dir: str = "/tmp",
+         aws_s3_bucket: str = None,
+         **kwargs
+     ) -> str:
+         """
+         Upload an image to S3.
+         """
+         if not aws_s3_bucket:
+             aws_s3_bucket = self.aws_s3_bucket
+         s3_key = os.path.join(*[x for x in (aws_s3_dir, event_id, sub_dir, filename) if x])
+         self.s3_client.upload_file(f"{filepath_dir}/{filename}", aws_s3_bucket, s3_key)
+         return f"https://{aws_s3_bucket}.s3.amazonaws.com/{s3_key}"
+
+     def s3_upload_json(
+         self,
+         data: dict,
+         filename: str,
+         aws_s3_dir: str = None,
+         event_id: str = None,
+         sub_dir: str = None,
+         aws_s3_bucket: str = None,
+         **kwargs
+     ) -> str:
+         """
+         Upload a JSON file to S3.
+         """
+         if not aws_s3_bucket:
+             aws_s3_bucket = self.aws_s3_bucket
+         s3_key = os.path.join(*[x for x in (aws_s3_dir, event_id, sub_dir, filename) if x])
+         self.s3_client.put_object(
+             Bucket=aws_s3_bucket,
+             Key=s3_key,
+             Body=json.dumps(data, indent=4, ensure_ascii=False),
+             ContentType='application/json'  # Optional but recommended
+         )
+         return f"https://{aws_s3_bucket}.s3.amazonaws.com/{s3_key}"
+
+     def send_error(
+         self,
+         event: dict,
+         tool_name: str,
+         tb: str,
+         send_error_webhook_url: str = None
+     ) -> requests.Response:
+         """
+         Send an error message to a Slack webhook.
+         """
+         if not send_error_webhook_url:
+             send_error_webhook_url = self.secret_dict.get('SEND_ERROR_WEBHOOK_URL', None)
+         payload = {
+             "blocks": [
+                 {
+                     "type": "header",
+                     "text": {
+                         "type": "plain_text",
+                         "text": f"Error in {tool_name} tool"
+                     }
+                 },
+                 {
+                     "type": "section",
+                     "text": {
+                         "type": "mrkdwn",
+                         "text": (
+                             f"Event ID: {event['event_id']}\n"
+                             f"User ID: {event['user_id']}\n"
+                             f"{tb}"
+                         )
+                     }
+                 }
+             ]
+         }
+         # Define headers
+         headers = {
+             "Content-type": "application/json"
+         }
+         # Send a POST request to the Slack webhook URL
+         return requests.post(send_error_webhook_url, json=payload, headers=headers)
+
+     def limit_text_tokens(
+         self,
+         text: str,
+         max_tokens: int = 10000
+     ) -> str:
+         """
+         Limit the number of tokens in a text string.
+         """
+         tokens = self.tiktoken_client.encode(text)
+         if len(tokens) > max_tokens:
+             tokens = tokens[:max_tokens]
+         return self.tiktoken_client.decode(tokens)
+
+     def scrape_scrapingbee_sdk(
+         self,
+         url: str,
+         tries: int = 3,
+         decode_utf: bool = False,
+         timeout: int = 30,
+         stealth_proxy: bool = False,
+         render_js: bool = False,
+         soup_convert: bool = True,
+         wait_browser: str = 'load'
+     ) -> Union[BeautifulSoup, str, bytes, bool]:
+         """
+         Scrape a webpage using the ScrapingBee SDK.
+         """
+         params = {
+             "wait_browser": wait_browser,
+             'timeout': str(timeout * 1000)  # ScrapingBee expects milliseconds
+         }
+         if stealth_proxy:
+             params['stealth_proxy'] = "true"
+         params['render_js'] = str(render_js).lower()
+         if url:
+             for _ in range(tries):
+                 try:
+                     # print("Website:", url)
+                     response = self.scrapingbee_client.get(
+                         url,
+                         params=params
+                     )
+                     # print("Status Code:", response.status_code)
+                     if response.ok:
+                         if decode_utf:
+                             content = response.content.decode("utf-8")
+                             if soup_convert:
+                                 return BeautifulSoup(content, 'html.parser')
+                             return content
+                         content = response.content
+                         if soup_convert:
+                             return BeautifulSoup(content, 'html.parser')
+                         return content
+                     elif response.status_code == 500:
+                         continue
+                     return False
+                 except Exception as e:
+                     print("ERROR in SCRAPINGBEE SCRAPE SDK")
+                     print(e)
+         return False
+
+     def scrape_requests(
+         self,
+         url: str,
+         soup_convert: bool = True,
+         tries: int = 3,
+         content_decode: str = 'utf-8'
+     ) -> Union[BeautifulSoup, str, bytes]:
+         """
+         Scrape a webpage using the requests library.
+         """
+         for _ in range(tries):
+             try:
+                 resp = requests.get(url)
+             except requests.RequestException:
+                 continue  # retry on network errors; the original let them propagate
+             try:
+                 content = resp.content.decode(content_decode)
+             except Exception:
+                 content = resp.content
+             if soup_convert:
+                 return BeautifulSoup(content, 'html.parser')
+             return content
+
+     def scrape(
+         self,
+         url: str,
+         soup_convert: bool = True,
+         tries: int = 3,
+         content_length: int = 500,
+         **kwargs
+     ) -> Union[BeautifulSoup, str, bytes, bool]:
+         """
+         Scrape a webpage with requests first, falling back to the ScrapingBee SDK
+         when the result is empty or suspiciously short.
+         """
+         content = self.scrape_requests(url, soup_convert=soup_convert, tries=tries)
+         if not content or (hasattr(content, "get_text") and len(content.get_text(" ", strip=True)) < content_length):
+             print("ScrapingBee SDK")
+             content = self.scrape_scrapingbee_sdk(url, soup_convert=soup_convert, tries=tries, stealth_proxy=True, render_js=True, **kwargs)
+         return content
+
+     def clean_text(
+         self,
+         text: str
+     ) -> str:
+         """
+         Clean the text by removing extra spaces, newlines, and tabs.
+         """
+         text = re.sub(r'\n+', '\n', text).strip()
+         text = re.sub(r'\t+', ' ', text).strip()
+         text = re.sub(r' +', ' ', text).strip()
+         text = re.sub('\n ', '\n', text).strip()
+         return text
+
+     def google_search_scrapingbee(
+         self,
+         query: str,
+         search_type: str = 'classic',
+         page: int = 1,
+         nb_results: int = 100,
+         device: str = 'desktop',
+         country_code: str = 'us',
+         add_html: bool = False,
+         nfpr: bool = False,
+         language: str = 'en',
+         take_screenshot: bool = False,
+         tries: int = 5,
+         **kwargs
+     ) -> dict:
+         """
+         Perform a Google search using the ScrapingBee API and return the parsed JSON results.
+         """
+         for _ in range(tries):
+             res = requests.get(
+                 url='https://app.scrapingbee.com/api/v1/store/google',
+                 params={
+                     'api_key': self.SCRAPINGBEE_API_KEY,
+                     'search': query,
+                     'language': language,
+                     "search_type": search_type,
+                     "page": page,
+                     "nb_results": nb_results,
+                     "device": device,
+                     "country_code": country_code,
+                     "add_html": add_html,
+                     "nfpr": nfpr
+                 },
+             )
+             if res.ok:
+                 search_results = res.json()
+                 if take_screenshot:
+                     screenshot_fn = self.screenshot_google_search_scapingbee(search_results['meta_data']['url'], **kwargs)
+                     search_results['screenshot_fn'] = screenshot_fn
+                 return search_results
+
+     def screenshot_google_search_scapingbee(
+         self,
+         url: str,
+         uid: str,
+         save_path: str = "/tmp",
+         country_code: str = 'us',
+         screenshot_full_page: bool = False,
+         max_height: int = 1080,
+         addtl_fn: str = "-google_search_screenshot",
+     ) -> dict:
+         """
+         Take a screenshot of a Google search results page using ScrapingBee.
+         """
+         params = {
+             'custom_google': True,
+             'stealth_proxy': True,
+             'country_code': country_code,
+             'screenshot': True
+         }
+         if screenshot_full_page:
+             params['screenshot_full_page'] = screenshot_full_page
+         response = self.scrapingbee_client.get(
+             url,
+             params=params
+         )
+         if response.ok:
+             image_np = np.frombuffer(response.content, dtype=np.uint8)
+             image_cv2 = cv2.imdecode(image_np, cv2.IMREAD_COLOR)
+             if screenshot_full_page:
+                 addtl_fn += "-full_page"
+             else:
+                 image_cv2 = image_cv2[:max_height, :]
+             if addtl_fn:
+                 filename = f"{uid}{addtl_fn}.png"
+             else:
+                 filename = f"{uid}.png"
+             # print("Image shape:", image_cv2.shape)
+             # Save the image using OpenCV
+             filepath = f"{save_path}/{filename}"
+             cv2.imwrite(filepath, image_cv2)
+             return {
+                 "filename": filename,
+                 "save_path": filepath,
+                 "viewport": {
+                     "width": image_cv2.shape[1],
+                     "height": image_cv2.shape[0]
+                 }
+             }
+
+     def screenshot_webpage(
+         self,
+         url: str,
+         uid: str,
+         idx: str,
+         save_path: str = "/tmp",
+         tries: int = 3
+     ) -> list:
+         """
+         Take a full-page screenshot of a webpage using the ScrapingBee API.
+         """
+         os.makedirs(save_path, exist_ok=True)
+         for _ in range(tries):
+             response2 = self.scrapingbee_client.get(
+                 url,
+                 params={
+                     'wait': '5000',
+                     'stealth_proxy': True,
+                     'country_code': 'us',
+                     "wait_browser": "networkidle0",
+                     'screenshot_full_page': True,
+                     "json_response": True,
+                     "render_js": True,
+                     'js_scenario': {
+                         "instructions": [
+                             {"wait": 1000},
+                             {
+                                 "infinite_scroll": {  # Scroll the page until the end
+                                     "max_count": 0,  # Maximum number of scrolls, 0 for infinite
+                                     "delay": 1000,  # Delay between each scroll, in ms
+                                     # "end_click": {"selector": "#button_id"}  # (optional) Click a button when the end of the page is reached, usually a "load more" button
+                                 }
+                             }
+                         ]
+                     }
+                 }
+             )
+             if response2.ok:
+                 res2_json = response2.json()
+                 return self.crop_images(res2_json['screenshot'], uid, idx, save_path)
+         return []
+
+     def get_render_link_urlbox(
+         self,
+         args: dict
+     ) -> str:
+         """
+         Generate a signed render link for the URLBox API.
+         """
+         URLBOX_API_SECRET = self.secret_dict.get('URLBOX_API_SECRET', None)
+         URLBOX_API_KEY = self.secret_dict.get('URLBOX_API_KEY', None)
+         queryString = urlencode(args, doseq=True)
+         # Sign the query string with HMAC-SHA256 using the API secret
+         hmacToken = hmac.new(str.encode(URLBOX_API_SECRET), str.encode(queryString), sha256)
+         token = hmacToken.hexdigest().rstrip('\n')
+         return "https://api.urlbox.com/v1/%s/%s/png?%s" % (URLBOX_API_KEY, token, queryString)
+
+     def screenshot_webpage_urlbox(
+         self,
+         url: str,
+         uid: str,
+         idx: str,
+         width: int = 1920,
+         height: int = 1080,
+         format: str = "png",
+         full_page: bool = True,
+         save_path: str = "/tmp",
+         full_page_mode: str = "stitch",
+         click_accept: bool = True,
+         press_escape: bool = True,
+         block_ads: bool = True,
+         hide_cookie_banners: bool = True,
+         delay: Union[str, int] = "5000",
+         scroll_delay: Union[str, int] = "200",
+         scroll_increment: Union[str, int] = "200",
+         wait_until: str = "requestsfinished",
+         engine_version: str = "stable",
+         user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
+         tries: int = 3
+     ) -> list:
+         """
+         Take a screenshot of a webpage using the URLBox API.
+         """
+         os.makedirs(save_path, exist_ok=True)
+         d = {
+             "format": format,
+             "url": url,
+             "width": width,
+             "height": height,
+             "full_page": full_page,
+             "full_page_mode": full_page_mode,
+             "click_accept": click_accept,
+             "press_escape": press_escape,
+             "block_ads": block_ads,
+             "hide_cookie_banners": hide_cookie_banners,
+             "delay": delay,
+             "wait_until": wait_until,
+             "scroll_delay": scroll_delay,
+             "engine_version": engine_version,
+             "scroll_increment": scroll_increment,
+             "user_agent": user_agent
+         }
+         for _ in range(tries):
+             render_link = self.get_render_link_urlbox(d)
+             response = requests.get(render_link)
+             if response.ok:
+                 return self.crop_images(response.content, uid, idx, save_path)
+         return []
+
+     def get_bytes_or_b64decoded(
+         self,
+         data: Union[bytes, str]
+     ) -> bytes:
+         """
+         Convert base64-encoded string to bytes or return bytes as is.
+         """
+         if isinstance(data, bytes):
+             return data
+         elif isinstance(data, str):
+             return b64decode(data)  # Let it raise if it's not valid base64 or not ASCII
+         else:
+             raise TypeError("Input must be bytes or base64-encoded string.")
+
+     def crop_images(
+         self,
+         img_data: bytes,
+         uid: str,
+         idx: str,
+         save_path: str,
+         max_h: int = 2048,
+         max_w: int = 2048,
+         screenshots: list = None,
+         page_length: int = 10,
+     ) -> list:
+         """
+         Crop a tall screenshot into page-sized images and save each crop.
+         """
+         if screenshots is None:
+             # Avoid the original mutable default argument, which accumulated results across calls
+             screenshots = []
+         img_data = self.get_bytes_or_b64decoded(img_data)
+         image_np = np.frombuffer(img_data, dtype=np.uint8)
+         image_cv2 = cv2.imdecode(image_np, cv2.IMREAD_COLOR)
+         curr_h = max_h
+         prev_h = 0
+         im_h = image_cv2.shape[0]
+         c = 1
+         b = False
+         for _ in range(page_length):
+             im_cropped = image_cv2[prev_h:curr_h, :max_w]
+             if im_cropped.size:
+                 im_cropped_shape = im_cropped.shape
+                 print("im_cropped.shape:", im_cropped.shape)
+                 fn = f"{uid}-{idx}-{c}.png"
+                 sp = f"{save_path}/{fn}"
+                 screenshots.append({"save_path": sp, "filename": fn, "viewport": {"width": im_cropped_shape[1], "height": im_cropped_shape[0]}})
+                 print("save_path:", sp)
+                 cv2.imwrite(sp, im_cropped)
+             else:
+                 b = True
+             if b:
+                 return screenshots
+             elif curr_h + max_h > im_h:
+                 prev_h = curr_h
+                 curr_h = im_h
+                 b = True
+             else:
+                 prev_h = curr_h
+                 curr_h += max_h
+             c += 1
+         return screenshots
+
+     def ts_semantic_search(
+         self,
+         query: str,
+         collection_name: str = None,
+         distance_threshold: float = 0.8,
+         limit: int = 5,
+         filter_by: str = None,
+         query_by: list = ['contents', 'embedding'],
+         exclude_fields: list = ['embedding'],
+         query_by_weights: list = [2, 1],
+         top_k: int = 200,
+         prioritize_exact_match: bool = True,
+         num_typos: int = 0,
+         prefix=False,
+         **kwargs: Any
+     ) -> dict:
+         """
+         Perform a semantic search using Typesense.
+         """
+         if not collection_name:
+             collection_name = self.secret_dict.get('TS_COLLECTION_NAME', None)
+
+         search_params = {
+             'q': query,
+             "prefix": prefix,
+             "prioritize_exact_match": prioritize_exact_match,
+             "limit": limit,
+             "num_typos": num_typos,
+         }
+         if not query_by:
+             raise ValueError("query_by must be provided")
+         elif len(query_by) == 1:
+             search_params['query_by'] = query_by[0]
+         elif len(query_by) != len(query_by_weights):
+             raise ValueError("query_by and query_by_weights must have the same length")
+         else:
+             # The original trailing commas turned these values into tuples
+             search_params['query_by'] = ",".join(query_by)
+             search_params['query_by_weights'] = ",".join([str(x) for x in query_by_weights])
+
+         if exclude_fields:
+             search_params['exclude_fields'] = ",".join(exclude_fields)
+         if distance_threshold:
+             search_params['vector_query'] = f"embedding:([], k: {top_k}, distance_threshold: {distance_threshold})"
+         if filter_by:
+             search_params['filter_by'] = filter_by
+         # print(json.dumps(search_params, indent=4, ensure_ascii=False))
+         return self.ts_client.collections[collection_name].documents.search(search_params)
+
+     def ts_upsert_data(
+         self,
+         data: dict,
+         collection_name: str = None
+     ) -> None:
+         """
+         Upsert data into a Typesense collection.
+         """
+         if not collection_name:
+             collection_name = self.secret_dict.get('TS_COLLECTION_NAME', None)
+         self.ts_client.collections[collection_name].documents.upsert(data)
+
+     def research(
+         self,
+         user_input: str,
+         url: str = None,
+         instructions: str = None,
+         score_thresh: int = 7,
+         **kwargs
+     ) -> dict:
+         """
+         Search Google for the user input, pick the first result whose description
+         is sufficiently similar, scrape it, and summarize the page with GPT.
+         """
+         if url is None:
+             # google_search_scrapingbee already returns the parsed JSON dict
+             resp_json = self.google_search_scrapingbee(user_input)
+
+             for res in resp_json['organic_results']:
+                 desc = res['description']
+                 title = res['title']
+                 domain = res['domain']
+
+                 webpage_desc = f'{title} - {domain}\n\n{desc}'
+
+                 messages = [
+                     {
+                         "role": "system",
+                         "content": f'''Compare the following two texts and determine their similarity. The first text is the user input, and the second text is a webpage description. Analyze whether the user input conveys a meaning, topic, or key ideas similar to the webpage description, even if the wording is different. Provide a similarity score from 0 to 10, with 10 being identical and 0 being completely unrelated. Additionally, highlight key matching themes, topics, or phrases.
+ User input: {user_input}
+ Webpage Description: {webpage_desc}'''
+                     }
+                 ]
+
+                 text_similarity_tool_name = "text_similarity_tool"
+                 text_similarity_schema = [
+                     {
+                         "type": "function",
+                         "function": {
+                             "name": text_similarity_tool_name,
+                             "strict": True,
+                             "description": "Compares two texts to determine their similarity. The first text is user input, and the second is a webpage description. The function analyzes whether the user input conveys a meaning, topic, or key ideas similar to the webpage description, even if the wording differs. It returns a similarity score from 0 to 10, where 10 means identical and 0 means completely unrelated.",
+                             "parameters": {
+                                 "type": "object",
+                                 "properties": {
+                                     "score": {
+                                         "type": "number",
+                                         "description": "A similarity score between 0 and 10, where 10 means the texts are identical in meaning, and 0 means they are completely unrelated.",
+                                     },
+                                     "reason": {
+                                         "type": "string",
+                                         "description": "A brief explanation of why the given similarity score was assigned, highlighting key matching themes, topics, or differences.",
+                                     }
+                                 },
+                                 "required": ["score", "reason"],
+                                 "additionalProperties": False
+                             },
+                         }
+                     }
+                 ]
+
+                 # tools_call_gpt builds tool_choice from tool_name and returns a dict,
+                 # not a raw completion (the original passed an unsupported tool_choice kwarg)
+                 retval = self.tools_call_gpt(
+                     messages=messages,
+                     tools=text_similarity_schema,
+                     tool_name=text_similarity_tool_name,
+                 )
+
+                 arguments = json.loads(retval['completion'].choices[0].message.tool_calls[0].function.arguments)
+                 score = arguments['score']
+                 reason = arguments['reason']
+
+                 print("score:", score)
+                 print("reason:", reason)
+                 print("domain:", domain)
+                 print("title:", title)
+                 print("desc:", desc)
+                 print()
+                 print("*" * 100)
+                 print()
+
+                 if score >= score_thresh:
+                     url = res['url']
+                     break
+         if url:
+             soup = self.scrape_requests(url)
+             if not soup:
+                 soup = self.scrape_scrapingbee_sdk(url)
+             clean_website_text = self.clean_text(soup.text)
+             # print(clean_website_text)
+             if instructions:
+                 content = f"Follow these instructions: '{instructions}'. Ensure the output meets the specified format or requirements while maintaining accuracy and clarity. If relevant data is unavailable, provide a reasonable alternative or explanation.\n\nScraped Website's Data:{clean_website_text}"
+             else:
+                 content = f"Summarize the key content of the following scraped webpage based on the user's input. Focus on the aspects most relevant to what the user is asking for, ensuring the summary aligns with their query. Highlight the main topics, key points, and any important details related to the user's intent. If applicable, emphasize notable features, services, or unique aspects relevant to their request.\n\nUser Input: {user_input}\n\nScraped Website Text:\n{clean_website_text}"
+             print(content)
+             messages2 = [
+                 {
+                     "role": "system",
+                     "content": content
+                 }
+             ]
+             retval2 = self.chat_completion_gpt(messages2)
+             result = retval2['completion'].choices[0].message.content
+             result += f"\n\nSource(s): {url}"
+         else:
+             messages2 = [
+                 {
+                     "role": "system",
+                     "content": f"Tell the user that there's no available data based on their search input: '{user_input}'. Instead, provide an insightful response related to their query using general knowledge, context, and reasoning to help address their question as best as possible."
+                 }
+             ]
+             retval2 = self.chat_completion_gpt(messages2)
+             result = retval2['completion'].choices[0].message.content
+         return {
+             "type": "string",
+             "value": result
+         }
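
The package's description advertises serverless use, but it ships no example. Here is a minimal usage sketch for an AWS Lambda handler. Everything named here is illustrative: the secret `aicmo/config` and bucket `my-artifacts-bucket` are placeholders, and the sketch assumes the secret contains at least `OPENAI_API_KEY`, `OPENAI_MODEL`, and a JSON `COST` mapping, since the constructor requires `COST` and the chat helpers default to `OPENAI_MODEL`.

```python
# Hypothetical usage sketch; secret and bucket names are placeholders, not part of the package.
from aicmo import AICMOClient

def lambda_handler(event, context):
    # The secret must hold OPENAI_API_KEY, OPENAI_MODEL, and a JSON COST mapping.
    client = AICMOClient(
        aws_secret_name="aicmo/config",
        aws_s3_bucket="my-artifacts-bucket",
    )
    # scrape() tries plain requests first and falls back to ScrapingBee for thin pages
    soup = client.scrape(event["url"])
    text = client.limit_text_tokens(client.clean_text(soup.get_text(" ", strip=True)), max_tokens=2000)
    # chat_completion_gpt returns {"completion", "status", "tokens"}, not a raw completion
    ret = client.chat_completion_gpt([{"role": "user", "content": f"Summarize:\n{text}"}])
    summary = ret["completion"].choices[0].message.content if ret["status"] == "success" else None
    return {"summary": summary, "tokens": ret["tokens"]}
```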
aicmo-0.0.1/aicmo.egg-info/PKG-INFO ADDED
@@ -0,0 +1,34 @@
+ Metadata-Version: 2.4
+ Name: aicmo
+ Version: 0.0.1
+ Summary: A package for using aicmo functions and tools
+ Author: Jayr Castro
+ Author-email: jayrcastro.py@gmail.com
+ Keywords: aicmo
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Operating System :: Unix
+ Classifier: Operating System :: MacOS :: MacOS X
+ Classifier: Operating System :: Microsoft :: Windows
+ Description-Content-Type: text/markdown
+ Requires-Dist: openai==1.75.0
+ Requires-Dist: scrapingbee==2.0.1
+ Requires-Dist: requests==2.32.3
+ Requires-Dist: boto3==1.37.37
+ Requires-Dist: tiktoken==0.9.0
+ Requires-Dist: opencv-python-headless==4.11.0.86
+ Requires-Dist: beautifulsoup4==4.13.4
+ Requires-Dist: numpy==2.2.4
+ Requires-Dist: python-dotenv==1.1.0
+ Requires-Dist: typesense==1.0.3
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: keywords
+ Dynamic: requires-dist
+ Dynamic: summary
+
+ A package for using aicmo functions and tools. It includes scraping and OpenAI helpers, with options for use in serverless applications such as AWS Lambda and GCP Cloud Functions.
aicmo-0.0.1/aicmo.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,8 @@
+ README.md
+ setup.py
+ aicmo/__init__.py
+ aicmo.egg-info/PKG-INFO
+ aicmo.egg-info/SOURCES.txt
+ aicmo.egg-info/dependency_links.txt
+ aicmo.egg-info/requires.txt
+ aicmo.egg-info/top_level.txt
aicmo-0.0.1/aicmo.egg-info/requires.txt ADDED
@@ -0,0 +1,10 @@
+ openai==1.75.0
+ scrapingbee==2.0.1
+ requests==2.32.3
+ boto3==1.37.37
+ tiktoken==0.9.0
+ opencv-python-headless==4.11.0.86
+ beautifulsoup4==4.13.4
+ numpy==2.2.4
+ python-dotenv==1.1.0
+ typesense==1.0.3
aicmo-0.0.1/aicmo.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ aicmo
aicmo-0.0.1/setup.cfg ADDED
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
aicmo-0.0.1/setup.py ADDED
@@ -0,0 +1,38 @@
+ from setuptools import setup, find_packages
+
+ VERSION = '0.0.1'
+
+ # Setting up
+ setup(
+     name="aicmo",
+     version=VERSION,
+     author="Jayr Castro",
+     author_email="jayrcastro.py@gmail.com",
+     description="A package for using aicmo functions and tools",
+     long_description_content_type="text/markdown",
+     long_description='A package for using aicmo functions and tools. It includes scraping and OpenAI helpers, with options for use in serverless applications such as AWS Lambda and GCP Cloud Functions.',
+     packages=find_packages(),
+     install_requires=[
+         "openai==1.75.0",
+         "scrapingbee==2.0.1",
+         "requests==2.32.3",
+         "boto3==1.37.37",
+         "tiktoken==0.9.0",
+         "opencv-python-headless==4.11.0.86",
+         "beautifulsoup4==4.13.4",
+         "numpy==2.2.4",
+         "python-dotenv==1.1.0",
+         "typesense==1.0.3"
+     ],
+     keywords=[
+         'aicmo'
+     ],
+     classifiers=[
+         "Development Status :: 4 - Beta",
+         "Intended Audience :: Developers",
+         "Programming Language :: Python :: 3.12",
+         "Operating System :: Unix",
+         "Operating System :: MacOS :: MacOS X",
+         "Operating System :: Microsoft :: Windows",
+     ]
+ )