atomicshop 2.18.18__py3-none-any.whl → 2.18.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of atomicshop might be problematic. Click here for more details.
- atomicshop/__init__.py +1 -1
- atomicshop/web_apis/google_custom_search.py +8 -1
- atomicshop/web_apis/google_llm.py +43 -15
- atomicshop/wrappers/playwrightw/scenarios.py +81 -11
- {atomicshop-2.18.18.dist-info → atomicshop-2.18.19.dist-info}/METADATA +1 -1
- {atomicshop-2.18.18.dist-info → atomicshop-2.18.19.dist-info}/RECORD +9 -9
- {atomicshop-2.18.18.dist-info → atomicshop-2.18.19.dist-info}/LICENSE.txt +0 -0
- {atomicshop-2.18.18.dist-info → atomicshop-2.18.19.dist-info}/WHEEL +0 -0
- {atomicshop-2.18.18.dist-info → atomicshop-2.18.19.dist-info}/top_level.txt +0 -0
atomicshop/__init__.py
CHANGED
|
@@ -25,7 +25,14 @@ def search_google(
|
|
|
25
25
|
|
|
26
26
|
try:
|
|
27
27
|
service = build("customsearch", "v1", developerKey=api_key)
|
|
28
|
-
result = service.cse().list(
|
|
28
|
+
result = service.cse().list(
|
|
29
|
+
q=query,
|
|
30
|
+
cx=search_engine_id,
|
|
31
|
+
# gl="us", # Country code
|
|
32
|
+
# lr="lang_en", # Language restriction
|
|
33
|
+
# safe="off", # Safe search off
|
|
34
|
+
# dateRestrict="m1" # Restrict results to the last month
|
|
35
|
+
).execute()
|
|
29
36
|
items = result.get('items', [])
|
|
30
37
|
links = [item['link'] for item in items if 'link' in item]
|
|
31
38
|
return links, error
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
import os
|
|
2
|
+
from typing import Literal
|
|
2
3
|
|
|
3
4
|
import google.generativeai as genai
|
|
4
5
|
|
|
5
6
|
from . import google_custom_search
|
|
6
7
|
from ..wrappers.playwrightw import scenarios
|
|
8
|
+
from .. import urls
|
|
7
9
|
|
|
8
10
|
|
|
9
11
|
class GoogleCustomSearchError(Exception):
|
|
@@ -41,8 +43,16 @@ class GoogleLLM:
|
|
|
41
43
|
|
|
42
44
|
def get_answer_online(
|
|
43
45
|
self,
|
|
44
|
-
|
|
45
|
-
|
|
46
|
+
search_query_or_url: str,
|
|
47
|
+
text_fetch_method: Literal[
|
|
48
|
+
'playwright_text',
|
|
49
|
+
'js_text',
|
|
50
|
+
'playwright_html',
|
|
51
|
+
'js_html',
|
|
52
|
+
'playwright_copypaste'
|
|
53
|
+
],
|
|
54
|
+
llm_query: str,
|
|
55
|
+
llm_post_instructions: str,
|
|
46
56
|
number_of_top_links: int = 2,
|
|
47
57
|
number_of_characters_per_link: int = 15000,
|
|
48
58
|
temperature: float = 0,
|
|
@@ -52,8 +62,17 @@ class GoogleLLM:
|
|
|
52
62
|
"""
|
|
53
63
|
Function to get the answer to a question by searching Google Custom Console API and processing the content using Gemini API.
|
|
54
64
|
|
|
55
|
-
:param
|
|
56
|
-
|
|
65
|
+
:param search_query_or_url: string, is checked if it is a URL or a search query.
|
|
66
|
+
Search query: the search query to search on Google Custom Search.
|
|
67
|
+
URL: the URL to fetch content from without using Google Custom Search.
|
|
68
|
+
:param text_fetch_method: string, the method to fetch text from the URL.
|
|
69
|
+
playwright_text: uses native Playwright to fetch text from the URL.
|
|
70
|
+
js_text: uses Playwright and JavaScript evaluation to fetch text from the URL.
|
|
71
|
+
playwright_html: uses native Playwright to fetch HTML from the URL and then parse it to text using beautiful soup.
|
|
72
|
+
js_html: uses Playwright and JavaScript evaluation to fetch HTML from the URL and then parse it to text using beautiful soup.
|
|
73
|
+
playwright_copypaste: uses native Playwright to fetch text from the URL by copying and pasting the text from rendered page using clipboard.
|
|
74
|
+
:param llm_query: string, the question to ask the LLM about the text content that is returned from the search query or the URL.
|
|
75
|
+
:param llm_post_instructions: string, additional instructions to provide to the LLM on the answer it provided after the llm_query.
|
|
57
76
|
:param number_of_top_links: integer, the number of top links to fetch content from.
|
|
58
77
|
:param number_of_characters_per_link: integer, the number of characters to fetch from each link.
|
|
59
78
|
:param temperature: float, the temperature parameter for the LLM.
|
|
@@ -63,22 +82,31 @@ class GoogleLLM:
|
|
|
63
82
|
:return: string, the answer by LLM to the question.
|
|
64
83
|
"""
|
|
65
84
|
|
|
66
|
-
#
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
85
|
+
# Check if the search query is a URL.
|
|
86
|
+
if urls.is_valid_url(search_query_or_url):
|
|
87
|
+
# Fetch content from the URL
|
|
88
|
+
contents = scenarios.fetch_urls_content_in_threads(
|
|
89
|
+
urls=[search_query_or_url], number_of_characters_per_link=number_of_characters_per_link,
|
|
90
|
+
text_fetch_method=text_fetch_method)
|
|
91
|
+
# If not a URL, Search Google for links related to the query
|
|
92
|
+
else:
|
|
93
|
+
links, search_error = google_custom_search.search_google(
|
|
94
|
+
query=search_query_or_url, api_key=self.search_api_key, search_engine_id=self.search_engine_id)
|
|
95
|
+
|
|
96
|
+
if search_error:
|
|
97
|
+
raise GoogleCustomSearchError(f"Error occurred when searching Google: {search_error}")
|
|
98
|
+
|
|
99
|
+
# Get only the first X links to not overload the LLM.
|
|
100
|
+
contents = scenarios.fetch_urls_content_in_threads(
|
|
101
|
+
urls=links[:number_of_top_links], number_of_characters_per_link=number_of_characters_per_link,
|
|
102
|
+
text_fetch_method=text_fetch_method)
|
|
75
103
|
|
|
76
104
|
combined_content = ""
|
|
77
105
|
for content in contents:
|
|
78
106
|
combined_content += f'{content}\n\n\n\n================================================================'
|
|
79
107
|
|
|
80
|
-
final_question = (f'Answer this question: {
|
|
81
|
-
f'Follow these instructions: {
|
|
108
|
+
final_question = (f'Answer this question: {llm_query}\n\n'
|
|
109
|
+
f'Follow these instructions: {llm_post_instructions}\n\n'
|
|
82
110
|
f'Based on these data contents:\n\n'
|
|
83
111
|
f'{combined_content}')
|
|
84
112
|
|
|
@@ -4,9 +4,11 @@ For example: run playwright, navigate to URL, get text from a locator.
|
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
6
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
7
|
+
from typing import Literal
|
|
7
8
|
|
|
8
9
|
from playwright.sync_api import sync_playwright
|
|
9
10
|
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
|
|
11
|
+
from bs4 import BeautifulSoup
|
|
10
12
|
|
|
11
13
|
from . import engine, base, combos
|
|
12
14
|
from ...basics import threads, multiprocesses
|
|
@@ -148,7 +150,14 @@ def _get_page_content_in_process(
|
|
|
148
150
|
|
|
149
151
|
def fetch_urls_content_in_threads(
|
|
150
152
|
urls: list[str],
|
|
151
|
-
number_of_characters_per_link: int
|
|
153
|
+
number_of_characters_per_link: int,
|
|
154
|
+
text_fetch_method: Literal[
|
|
155
|
+
'playwright_text',
|
|
156
|
+
'js_text',
|
|
157
|
+
'playwright_html',
|
|
158
|
+
'js_html',
|
|
159
|
+
'playwright_copypaste'
|
|
160
|
+
]
|
|
152
161
|
) -> list[str]:
|
|
153
162
|
""" The function to fetch all URLs concurrently using threads """
|
|
154
163
|
contents = []
|
|
@@ -156,7 +165,7 @@ def fetch_urls_content_in_threads(
|
|
|
156
165
|
# Use ThreadPoolExecutor to run multiple threads
|
|
157
166
|
with ThreadPoolExecutor() as executor:
|
|
158
167
|
# Submit tasks for each URL
|
|
159
|
-
future_to_url = {executor.submit(_fetch_content, url, number_of_characters_per_link): url for url in urls}
|
|
168
|
+
future_to_url = {executor.submit(_fetch_content, url, number_of_characters_per_link, text_fetch_method): url for url in urls}
|
|
160
169
|
|
|
161
170
|
# Collect results as they complete
|
|
162
171
|
for future in as_completed(future_to_url):
|
|
@@ -172,23 +181,62 @@ def fetch_urls_content_in_threads(
|
|
|
172
181
|
|
|
173
182
|
def fetch_urls_content(
|
|
174
183
|
urls: list[str],
|
|
175
|
-
number_of_characters_per_link: int
|
|
184
|
+
number_of_characters_per_link: int,
|
|
185
|
+
text_fetch_method: Literal[
|
|
186
|
+
'playwright_text',
|
|
187
|
+
'js_text',
|
|
188
|
+
'playwright_html',
|
|
189
|
+
'js_html',
|
|
190
|
+
'playwright_copypaste'
|
|
191
|
+
],
|
|
176
192
|
) -> list[str]:
|
|
177
193
|
""" The function to fetch all URLs not concurrently without using threads """
|
|
178
194
|
contents = []
|
|
179
195
|
|
|
180
196
|
for url in urls:
|
|
181
|
-
data = _fetch_content(url, number_of_characters_per_link)
|
|
197
|
+
data = _fetch_content(url, number_of_characters_per_link, text_fetch_method)
|
|
182
198
|
contents.append(data)
|
|
183
199
|
|
|
184
200
|
return contents
|
|
185
201
|
|
|
186
202
|
|
|
187
|
-
def _fetch_content(
|
|
203
|
+
def _fetch_content(
|
|
204
|
+
url,
|
|
205
|
+
number_of_characters_per_link,
|
|
206
|
+
text_fetch_method: Literal[
|
|
207
|
+
'playwright_text',
|
|
208
|
+
'js_text',
|
|
209
|
+
'playwright_html',
|
|
210
|
+
'js_html',
|
|
211
|
+
'playwright_copypaste'
|
|
212
|
+
],
|
|
213
|
+
headless: bool = True):
|
|
188
214
|
""" Function to fetch content from a single URL using the synchronous Playwright API """
|
|
215
|
+
|
|
189
216
|
with sync_playwright() as p:
|
|
190
|
-
browser = p.chromium.launch(headless=headless)
|
|
191
|
-
|
|
217
|
+
browser = p.chromium.launch(headless=headless) # Set headless=True if you don't want to see the browser
|
|
218
|
+
|
|
219
|
+
if text_fetch_method == "playwright_copypaste":
|
|
220
|
+
context = browser.new_context(permissions=["clipboard-read", "clipboard-write"])
|
|
221
|
+
else:
|
|
222
|
+
context = browser.new_context()
|
|
223
|
+
|
|
224
|
+
page = context.new_page()
|
|
225
|
+
|
|
226
|
+
# from playwright_stealth import stealth_sync
|
|
227
|
+
# stealth_sync(page)
|
|
228
|
+
|
|
229
|
+
# # Block specific script by URL or partial URL match
|
|
230
|
+
# def block_script(route):
|
|
231
|
+
# if "custom.js" in route.request.url:
|
|
232
|
+
# print(f"Blocking: {route.request.url}")
|
|
233
|
+
# route.abort() # Block the request
|
|
234
|
+
# else:
|
|
235
|
+
# route.continue_() # Allow other requests
|
|
236
|
+
#
|
|
237
|
+
# # Intercept and handle network requests
|
|
238
|
+
# page.route("**/*", block_script)
|
|
239
|
+
|
|
192
240
|
page.goto(url)
|
|
193
241
|
|
|
194
242
|
# Wait for the page to load using all possible methods, since there is no specific method
|
|
@@ -207,8 +255,32 @@ def _fetch_content(url, number_of_characters_per_link, headless: bool = True):
|
|
|
207
255
|
except PlaywrightTimeoutError:
|
|
208
256
|
break
|
|
209
257
|
|
|
210
|
-
|
|
211
|
-
|
|
258
|
+
if text_fetch_method == "playwright_text":
|
|
259
|
+
text_content = page.inner_text('body')
|
|
260
|
+
elif text_fetch_method == "js_text":
|
|
261
|
+
# Use JavaScript to extract only the visible text from the page
|
|
262
|
+
text_content: str = page.evaluate("document.body.innerText")
|
|
263
|
+
elif text_fetch_method == "playwright_html":
|
|
264
|
+
# Get the full HTML content of the page
|
|
265
|
+
html = page.content()
|
|
266
|
+
# Parse the HTML using BeautifulSoup and extract the text
|
|
267
|
+
soup = BeautifulSoup(html, 'html.parser')
|
|
268
|
+
text_content = soup.get_text()
|
|
269
|
+
elif text_fetch_method == "js_html":
|
|
270
|
+
# Use JavaScript to extract the full text from the page
|
|
271
|
+
html = page.evaluate('document.documentElement.outerHTML')
|
|
272
|
+
# Parse the HTML using BeautifulSoup and extract the text
|
|
273
|
+
soup = BeautifulSoup(html, 'html.parser')
|
|
274
|
+
text_content = soup.get_text()
|
|
275
|
+
elif text_fetch_method == "playwright_copypaste":
|
|
276
|
+
# Focus the page and simulate Ctrl+A and Ctrl+C
|
|
277
|
+
page.keyboard.press("Control+a") # Select all text
|
|
278
|
+
page.keyboard.press("Control+c") # Copy text to clipboard
|
|
279
|
+
# Retrieve copied text from the clipboard
|
|
280
|
+
text_content = page.evaluate("navigator.clipboard.readText()")
|
|
281
|
+
else:
|
|
282
|
+
raise ValueError(f"Invalid text_fetch_method: {text_fetch_method}")
|
|
283
|
+
|
|
212
284
|
# text = page.evaluate('document.body.textContent')
|
|
213
285
|
# text = page.eval_on_selector('body', 'element => element.innerText')
|
|
214
286
|
# text = page.eval_on_selector('body', 'element => element.textContent')
|
|
@@ -217,8 +289,6 @@ def _fetch_content(url, number_of_characters_per_link, headless: bool = True):
|
|
|
217
289
|
|
|
218
290
|
# text = page.evaluate('document.documentElement.innerText')
|
|
219
291
|
# text = page.inner_text(':root')
|
|
220
|
-
# html = page.content()
|
|
221
|
-
# html = page.evaluate('document.documentElement.outerHTML')
|
|
222
292
|
|
|
223
293
|
browser.close()
|
|
224
294
|
# Return only the first X characters of the text content to not overload the LLM.
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
atomicshop/__init__.py,sha256=
|
|
1
|
+
atomicshop/__init__.py,sha256=VnFf6mmsMm3Gl5pv70NpCxRaUMo5qq2TDFjlnEeDvJs,124
|
|
2
2
|
atomicshop/_basics_temp.py,sha256=6cu2dd6r2dLrd1BRNcVDKTHlsHs_26Gpw8QS6v32lQ0,3699
|
|
3
3
|
atomicshop/_create_pdf_demo.py,sha256=Yi-PGZuMg0RKvQmLqVeLIZYadqEZwUm-4A9JxBl_vYA,3713
|
|
4
4
|
atomicshop/_patch_import.py,sha256=ENp55sKVJ0e6-4lBvZnpz9PQCt3Otbur7F6aXDlyje4,6334
|
|
@@ -177,8 +177,8 @@ atomicshop/startup/win/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
|
|
|
177
177
|
atomicshop/startup/win/startup_folder.py,sha256=2RZEyF-Mf8eWPlt_-OaoGKKnMs6YhELEzJZ376EI0E0,1891
|
|
178
178
|
atomicshop/startup/win/task_scheduler.py,sha256=qALe-8sfthYxsdCViH2r8OsH3x-WauDqteg5RzElPdk,4348
|
|
179
179
|
atomicshop/web_apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
180
|
-
atomicshop/web_apis/google_custom_search.py,sha256=
|
|
181
|
-
atomicshop/web_apis/google_llm.py,sha256=
|
|
180
|
+
atomicshop/web_apis/google_custom_search.py,sha256=R1BnUmBFWZIWkfizSRWoSYoZTdPEjLJ28F_sS2g1jGQ,1558
|
|
181
|
+
atomicshop/web_apis/google_llm.py,sha256=X_sG3leUvskPCPryN6YszDFih_X2Ne0OSMA3UbDMKIg,6741
|
|
182
182
|
atomicshop/wrappers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
183
183
|
atomicshop/wrappers/_process_wrapper_curl.py,sha256=XkZZXYl7D0Q6UfdWqy-18AvpU0yVp9i2BVD2qRcXlkk,841
|
|
184
184
|
atomicshop/wrappers/_process_wrapper_tar.py,sha256=WUMZFKNrlG4nJP9tWZ51W7BR1j_pIjsjgyAStmWjRGs,655
|
|
@@ -275,7 +275,7 @@ atomicshop/wrappers/playwrightw/javascript.py,sha256=_bW7CAtm0Y8IHYrAalg5HpPFnk6
|
|
|
275
275
|
atomicshop/wrappers/playwrightw/keyboard.py,sha256=zN3YddGO-qUkn6C0CRVFejP4cTuaUwXLDNFhFREjERY,422
|
|
276
276
|
atomicshop/wrappers/playwrightw/locators.py,sha256=6wsLywZxDuii7mwv-zQsRbqQC8r7j96Bma5b5_7ZoVo,2411
|
|
277
277
|
atomicshop/wrappers/playwrightw/mouse.py,sha256=-2FZbQtjgH7tdXWld6ZPGqlKFUdf5in0ujN0hewxa50,656
|
|
278
|
-
atomicshop/wrappers/playwrightw/scenarios.py,sha256=
|
|
278
|
+
atomicshop/wrappers/playwrightw/scenarios.py,sha256=Xvl1jUmQhd4l0MmOUgQKfgGleblIyE-qC3wuoyx16tU,11531
|
|
279
279
|
atomicshop/wrappers/playwrightw/waits.py,sha256=PBFdz_PoM7Fo7O8hLqMrxNPzBEYgPoXwZceFFCGGeu8,7182
|
|
280
280
|
atomicshop/wrappers/psutilw/cpus.py,sha256=w6LPBMINqS-T_X8vzdYkLS2Wzuve28Ydp_GafTCngrc,236
|
|
281
281
|
atomicshop/wrappers/psutilw/disks.py,sha256=3ZSVoommKH1TWo37j_83frB-NqXF4Nf5q5mBCX8G4jE,9221
|
|
@@ -320,8 +320,8 @@ atomicshop/wrappers/socketw/statistics_csv.py,sha256=fgMzDXI0cybwUEqAxprRmY3lqbh
|
|
|
320
320
|
atomicshop/wrappers/winregw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
321
321
|
atomicshop/wrappers/winregw/winreg_installed_software.py,sha256=Qzmyktvob1qp6Tjk2DjLfAqr_yXV0sgWzdMW_9kwNjY,2345
|
|
322
322
|
atomicshop/wrappers/winregw/winreg_network.py,sha256=AENV88H1qDidrcpyM9OwEZxX5svfi-Jb4N6FkS1xtqA,8851
|
|
323
|
-
atomicshop-2.18.
|
|
324
|
-
atomicshop-2.18.
|
|
325
|
-
atomicshop-2.18.
|
|
326
|
-
atomicshop-2.18.
|
|
327
|
-
atomicshop-2.18.
|
|
323
|
+
atomicshop-2.18.19.dist-info/LICENSE.txt,sha256=lLU7EYycfYcK2NR_1gfnhnRC8b8ccOTElACYplgZN88,1094
|
|
324
|
+
atomicshop-2.18.19.dist-info/METADATA,sha256=laKTJjYAM6iOz-IeezdJ7sYHK8LFfJX1Zuc3Ru-Ktkg,10577
|
|
325
|
+
atomicshop-2.18.19.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
326
|
+
atomicshop-2.18.19.dist-info/top_level.txt,sha256=EgKJB-7xcrAPeqTRF2laD_Np2gNGYkJkd4OyXqpJphA,11
|
|
327
|
+
atomicshop-2.18.19.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|