semantio-0.0.6-py3-none-any.whl → semantio-0.0.8-py3-none-any.whl
- semantio/agent.py +218 -313
- semantio/knowledge_base/document_loader.py +176 -46
- semantio/tools/web_browser.py +344 -238
- {semantio-0.0.6.dist-info → semantio-0.0.8.dist-info}/METADATA +4 -2
- {semantio-0.0.6.dist-info → semantio-0.0.8.dist-info}/RECORD +9 -9
- {semantio-0.0.6.dist-info → semantio-0.0.8.dist-info}/LICENSE +0 -0
- {semantio-0.0.6.dist-info → semantio-0.0.8.dist-info}/WHEEL +0 -0
- {semantio-0.0.6.dist-info → semantio-0.0.8.dist-info}/entry_points.txt +0 -0
- {semantio-0.0.6.dist-info → semantio-0.0.8.dist-info}/top_level.txt +0 -0
--- a/semantio/knowledge_base/document_loader.py
+++ b/semantio/knowledge_base/document_loader.py
@@ -1,61 +1,191 @@
-
+import os
+import json
+import csv
+import re
 from pathlib import Path
+from typing import List, Dict, Any
+from io import BytesIO
 
-
-
-    A class to load documents from various sources (e.g., files, URLs) into the knowledge base.
-    """
+import requests
+from bs4 import BeautifulSoup
 
-
-
-
-
-
+# Optional: Import pandas for XLSX support and PyPDF2 for PDF support
+try:
+    import pandas as pd
+except ImportError:
+    pd = None
 
-
-
-
+try:
+    from PyPDF2 import PdfReader
+except ImportError:
+    PdfReader = None
 
-        Args:
-            file_path (str): The path to the file.
 
-
-
-
-
-
-
+def flatten_json(data: Any, parent_key: str = "", separator: str = "_") -> List[Dict[str, Any]]:
+    """
+    Recursively flatten a JSON structure.
+    For each key-value pair, add an entry mapping key->value.
+    Additionally, if the value is a string, add an entry mapping the value to its flattened key.
+    """
+    items = []
+    if isinstance(data, dict):
+        for key, value in data.items():
+            new_key = f"{parent_key}{separator}{key}" if parent_key else key
+            if isinstance(value, (dict, list)):
+                items.extend(flatten_json(value, new_key, separator))
+            else:
+                items.append({new_key: value})
+                if isinstance(value, str):
+                    items.append({value: new_key})
+    elif isinstance(data, list):
+        for index, item in enumerate(data):
+            new_key = f"{parent_key}{separator}{index}" if parent_key else str(index)
+            if isinstance(item, (dict, list)):
+                items.extend(flatten_json(item, new_key, separator))
+            else:
+                items.append({new_key: item})
+                if isinstance(item, str):
+                    items.append({item: new_key})
+    return items
 
-        # Example: Load a JSON file
-        if file_path.suffix == ".json":
-            import json
-            with open(file_path, "r") as f:
-                return json.load(f)
-        # Example: Load a text file
-        elif file_path.suffix == ".txt":
-            with open(file_path, "r") as f:
-                return [{"text": f.read()}]
-        else:
-            raise ValueError(f"Unsupported file type: {file_path.suffix}")
 
-
+class DocumentLoader:
+    """
+    A dynamic document loader that supports multiple source types:
+
+      - Local files: CSV, TXT, JSON, XLSX, PDF
+      - URL sources: HTML websites (text extraction), JSON APIs, PDF URLs
+      - YouTube links: Extracts transcripts using youtube_transcript_api
+
+    For JSON sources, if flatten is True (default), the returned document is a dictionary with two keys:
+        "original": the raw JSON data,
+        "flattened": a list of flattened key/value pairs (including reverse mappings).
+    """
+    def load(self, source: str, flatten: bool = True) -> List[Dict[str, Any]]:
         """
-        Load documents from
+        Load documents from the given source.
+        If source starts with "http", treat it as a URL; otherwise, as a local file.
+        """
+        if source.startswith("http"):
+            return self.load_from_url(source, flatten=flatten)
+        else:
+            return self.load_from_file(source, flatten=flatten)
 
-
-
+    def load_from_file(self, file_path: str, flatten: bool = True) -> List[Dict[str, Any]]:
+        path = Path(file_path)
+        if not path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+        ext = path.suffix.lower()
+        if ext == ".json":
+            with open(path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+            if flatten:
+                return [{"original": data, "flattened": flatten_json(data)}]
+            else:
+                return data if isinstance(data, list) else [data]
+        elif ext == ".txt":
+            with open(path, "r", encoding="utf-8") as f:
+                content = f.read()
+            return [{"text": content}]
+        elif ext == ".csv":
+            with open(path, "r", encoding="utf-8") as f:
+                reader = csv.DictReader(f)
+                return [row for row in reader]
+        elif ext == ".xlsx":
+            if pd is None:
+                raise ImportError("pandas is required to load XLSX files")
+            df = pd.read_excel(path)
+            return df.to_dict(orient="records")
+        elif ext == ".pdf":
+            if PdfReader is None:
+                raise ImportError("PyPDF2 is required to load PDF files")
+            reader = PdfReader(str(path))
+            content = ""
+            for page in reader.pages:
+                content += page.extract_text() or ""
+            return [{"text": content}]
+        else:
+            raise ValueError(f"Unsupported file type: {ext}")
 
-
-
-
-        import requests
+    def load_from_url(self, url: str, flatten: bool = True) -> List[Dict[str, Any]]:
+        if "youtube.com" in url or "youtu.be" in url:
+            return self._load_youtube(url)
         response = requests.get(url)
         if response.status_code != 200:
             raise ValueError(f"Failed to fetch data from URL: {url}")
-
-
-
-
-
+        content_type = response.headers.get("Content-Type", "").lower()
+        if "application/json" in content_type:
+            data = response.json()
+            if flatten:
+                return [{"original": data, "flattened": flatten_json(data)}]
+            else:
+                return data if isinstance(data, list) else [data]
+        elif "text/html" in content_type:
+            # First, try with requests + BeautifulSoup.
+            soup = BeautifulSoup(response.text, "html.parser")
+            text = soup.get_text(separator="\n").strip()
+            # If the text seems too short (less than 50 words), assume content is loaded via JavaScript.
+            if len(text.split()) < 50:
+                try:
+                    text = self._fetch_with_headless_browser(url)
+                except Exception as e:
+                    # If headless browser fails, log and fallback to the short text.
+                    print(f"Headless fetch failed: {e}")
+            return [{"text": text}]
+        elif "application/pdf" in content_type:
+            if PdfReader is None:
+                raise ImportError("PyPDF2 is required to load PDF files")
+            pdf_file = BytesIO(response.content)
+            reader = PdfReader(pdf_file)
+            text = ""
+            for page in reader.pages:
+                text += page.extract_text() or ""
+            return [{"text": text}]
         else:
-            return [{"text": response.text}]
+            return [{"text": response.text}]
+
+    def _fetch_with_headless_browser(self, url: str) -> str:
+        """
+        Use a headless browser (Playwright) to fetch fully rendered content.
+        """
+        try:
+            from playwright.sync_api import sync_playwright
+        except ImportError:
+            raise ImportError("playwright is required for JS-rendered pages. Install it with 'pip install playwright' and run 'playwright install'.")
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=True)
+            page = browser.new_page()
+            page.goto(url, wait_until="networkidle")
+            html = page.content()
+            browser.close()
+        soup = BeautifulSoup(html, "html.parser")
+        text = soup.get_text(separator="\n").strip()
+        return text
+
+    def _load_youtube(self, url: str) -> List[Dict[str, Any]]:
+        try:
+            from youtube_transcript_api import YouTubeTranscriptApi
+        except ImportError:
+            raise ImportError("youtube_transcript_api is required to load YouTube transcripts")
+
+        video_id = None
+        patterns = [r"v=([^&]+)", r"youtu\.be/([^?&]+)"]
+        for pattern in patterns:
+            match = re.search(pattern, url)
+            if match:
+                video_id = match.group(1)
+                break
+        if not video_id:
+            raise ValueError("Could not extract video ID from URL")
+
+        # Define a prioritized list of language codes to try
+        preferred_languages = ["en", "hi", "es", "fr", "de", "ru"]
+
+        try:
+            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=preferred_languages)
+            text = " ".join(segment["text"] for segment in transcript)
+            return [{"text": text}]
+        except Exception as e:
+            # Return a fallback document indicating transcript retrieval failed
+            return [{"text": f"Transcript not available for video {url}: {str(e)}"}]
+
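
For reference, a minimal sketch of how the new `flatten_json` helper behaves, assuming semantio 0.0.8 is installed; the sample dictionary is made up for illustration:

```python
# Minimal sketch of flatten_json (new in 0.0.8); the sample data is illustrative only.
from semantio.knowledge_base.document_loader import flatten_json

sample = {"user": {"name": "Ada", "id": 7}}

# Nested keys are joined with "_", and string values also get a reverse
# mapping (value -> flattened key), per the function's docstring.
print(flatten_json(sample))
# [{'user_name': 'Ada'}, {'Ada': 'user_name'}, {'user_id': 7}]
```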
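A short usage sketch of the reworked `DocumentLoader`; the file name and URLs are placeholders, and the optional extras (pandas, PyPDF2, playwright, youtube_transcript_api) are only needed for the corresponding source types:

```python
# Usage sketch for the 0.0.8 DocumentLoader; "data.json", the article URL, and
# VIDEO_ID are placeholders, not assets shipped with the package.
from semantio.knowledge_base.document_loader import DocumentLoader

loader = DocumentLoader()

# Local JSON file: with flatten=True (the default) each document carries the raw
# JSON under "original" and the flattened key/value pairs under "flattened".
docs = loader.load("data.json")

# Same file without flattening: the raw JSON is returned as-is (wrapped in a list).
raw = loader.load("data.json", flatten=False)

# URLs are dispatched on Content-Type: JSON APIs, HTML pages (with a Playwright
# fallback for JS-heavy sites), and PDFs; YouTube links go through the transcript path.
pages = loader.load("https://example.com/article")
transcript = loader.load("https://www.youtube.com/watch?v=VIDEO_ID")
```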