groundx 2.2.7.tar.gz → 2.2.9.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {groundx-2.2.7 → groundx-2.2.9}/PKG-INFO +1 -1
- {groundx-2.2.7 → groundx-2.2.9}/pyproject.toml +1 -1
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/core/client_wrapper.py +1 -1
- groundx-2.2.9/src/groundx/csv_splitter.py +64 -0
- groundx-2.2.9/src/groundx/ingest.py +550 -0
- groundx-2.2.7/src/groundx/ingest.py +0 -531
- {groundx-2.2.7 → groundx-2.2.9}/LICENSE +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/README.md +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/__init__.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/buckets/__init__.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/buckets/client.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/client.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/core/__init__.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/core/api_error.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/core/datetime_utils.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/core/file.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/core/http_client.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/core/jsonable_encoder.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/core/pydantic_utilities.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/core/query_encoder.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/core/remove_none_from_dict.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/core/request_options.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/core/serialization.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/customer/__init__.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/customer/client.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/documents/__init__.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/documents/client.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/environment.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/errors/__init__.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/errors/bad_request_error.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/errors/unauthorized_error.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/groups/__init__.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/groups/client.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/health/__init__.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/health/client.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/py.typed +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/search/__init__.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/search/client.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/search/types/__init__.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/search/types/search_content_request_id.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/__init__.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/bounding_box_detail.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/bucket_detail.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/bucket_list_response.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/bucket_response.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/bucket_update_detail.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/bucket_update_response.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/customer_detail.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/customer_response.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/document.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/document_detail.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/document_list_response.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/document_local_ingest_request.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/document_lookup_response.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/document_response.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/document_type.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/group_detail.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/group_list_response.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/group_response.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/health_response.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/health_response_health.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/health_service.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/health_service_status.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/ingest_local_document.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/ingest_local_document_metadata.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/ingest_remote_document.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/ingest_response.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/ingest_response_ingest.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/message_response.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/meter_detail.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/process_level.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/process_status_response.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/process_status_response_ingest.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/process_status_response_ingest_progress.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/process_status_response_ingest_progress_cancelled.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/process_status_response_ingest_progress_complete.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/process_status_response_ingest_progress_errors.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/process_status_response_ingest_progress_processing.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/processes_status_response.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/processing_status.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/search_response.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/search_response_search.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/search_result_item.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/sort.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/sort_order.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/subscription_detail.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/subscription_detail_meters.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/types/website_source.py +0 -0
- {groundx-2.2.7 → groundx-2.2.9}/src/groundx/version.py +0 -0
groundx-2.2.9/src/groundx/csv_splitter.py (new file, 64 lines):

```diff
@@ -0,0 +1,64 @@
+import csv, math, os, tempfile, typing
+from pathlib import Path
+
+
+class CSVSplitter:
+    def __init__(self, filepath, delimiter=','):
+        self.filepath = filepath
+        self.delimiter = delimiter
+        self.filename = os.path.basename(filepath)
+        self.file_size = os.path.getsize(filepath)
+        self.rows_count = self.get_row_count()
+
+    def get_row_count(self):
+        with open(self.filepath, "r", newline="", encoding="utf-8") as csvfile:
+            return sum(1 for _ in csvfile) - 1
+
+    def determine_splits(self):
+        row_mod = int(self.rows_count / 1000) + 1
+        file_mod = int(self.file_size / 1024 / 1024) + 1
+
+        return max(row_mod, file_mod)
+
+    def split(self):
+        splits = self.determine_splits()
+        if splits < 2:
+            return [Path(self.filepath)]
+
+        rows_per_file = math.ceil(self.rows_count / splits)
+
+        split_files: typing.List[Path] = []
+        with open(self.filepath, "r", newline="", encoding="utf-8") as csvfile:
+            reader = csv.reader(csvfile, delimiter=self.delimiter)
+            headers = next(reader)
+
+            temp_dir = tempfile.mkdtemp()
+
+            current_file_number = 1
+            current_row = 0
+            current_writer = None
+            current_output_file = None
+
+            for row in reader:
+                if current_row % rows_per_file == 0:
+                    if current_output_file:
+                        current_output_file.close()
+                    output_file_path = os.path.join(
+                        temp_dir,
+                        f"{os.path.splitext(self.filename)[0]}_{current_file_number}.csv",
+                    )
+                    split_files.append(Path(output_file_path))
+                    current_output_file = open(
+                        output_file_path, "w", newline="", encoding="utf-8"
+                    )
+                    current_writer = csv.writer(current_output_file, delimiter=self.delimiter)
+                    current_writer.writerow(headers)
+                    current_file_number += 1
+
+                current_writer.writerow(row)
+                current_row += 1
+
+            if current_output_file:
+                current_output_file.close()
+
+        return split_files
```
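For orientation, here is a minimal sketch of driving the new splitter on its own. The input file is hypothetical; as the code above shows, the part count is the larger of a row-based estimate (about 1,000 rows per part) and a size-based estimate (about 1 MiB of source per part), and every part gets its own copy of the header row.

```python
# Minimal sketch of the new CSVSplitter; "data.csv" is a hypothetical
# input file, not something shipped with the package.
from groundx.csv_splitter import CSVSplitter

splitter = CSVSplitter(filepath="data.csv")  # delimiter defaults to ','
print(splitter.rows_count)                   # data rows, header excluded
print(splitter.determine_splits())           # max of row-based and size-based part counts

# With fewer than two parts, split() returns [Path("data.csv")] unchanged;
# otherwise it writes header-bearing chunks to a temp dir and returns their paths.
for part in splitter.split():
    print(part)
```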
groundx-2.2.9/src/groundx/ingest.py (new file, 550 lines):

```diff
@@ -0,0 +1,550 @@
+import requests, time, typing, os
+from pathlib import Path
+from tqdm import tqdm
+from urllib.parse import urlparse, urlunparse
+
+from .client import GroundXBase, AsyncGroundXBase
+from .core.request_options import RequestOptions
+from .csv_splitter import CSVSplitter
+from .types.document import Document
+from .types.ingest_remote_document import IngestRemoteDocument
+from .types.ingest_response import IngestResponse
+
+# this is used as the default value for optional parameters
+OMIT = typing.cast(typing.Any, ...)
+
+
+DOCUMENT_TYPE_TO_MIME = {
+    "bmp": "image/bmp",
+    "gif": "image/gif",
+    "heif": "image/heif",
+    "hwp": "application/x-hwp",
+    "ico": "image/vnd.microsoft.icon",
+    "svg": "image/svg",
+    "tiff": "image/tiff",
+    "webp": "image/webp",
+    "txt": "text/plain",
+    "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+    "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+    "pdf": "application/pdf",
+    "png": "image/png",
+    "jpg": "image/jpeg",
+    "csv": "text/csv",
+    "tsv": "text/tab-separated-values",
+    "json": "application/json",
+}
+MIME_TO_DOCUMENT_TYPE = {v: k for k, v in DOCUMENT_TYPE_TO_MIME.items()}
+
+ALLOWED_SUFFIXES = {f".{k}": v for k, v in DOCUMENT_TYPE_TO_MIME.items()}
+
+CSV_SPLITS = {
+    ".csv": True,
+}
+TSV_SPLITS = {
+    ".tsv": True,
+}
+
+SUFFIX_ALIASES = {
+    ".jpeg": "jpg",
+    ".heic": "heif",
+    ".tif": "tiff",
+    ".md": "txt",
+}
+
+MAX_BATCH_SIZE = 50
+MIN_BATCH_SIZE = 1
+MAX_BATCH_SIZE_BYTES = 50 * 1024 * 1024
+
+def get_presigned_url(
+    endpoint: str,
+    file_name: str,
+    file_extension: str,
+) -> typing.Dict[str, typing.Any]:
+    params = {"name": file_name, "type": file_extension}
+    response = requests.get(endpoint, params=params)
+    response.raise_for_status()
+
+    return response.json()
+
+def strip_query_params(
+    url: str,
+) -> str:
+    parsed = urlparse(url)
+    clean_url = urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
+
+    return clean_url
+
+def prep_documents(
+    documents: typing.Sequence[Document],
+) -> typing.Tuple[
+    typing.List[IngestRemoteDocument],
+    typing.List[Document],
+]:
+    """
+    Process documents and separate them into remote and local documents.
+    """
+    if not documents:
+        raise ValueError("No documents provided for ingestion.")
+
+    def is_valid_local_path(path: str) -> bool:
+        expanded_path = os.path.expanduser(path)
+        return os.path.exists(expanded_path)
+
+    def is_valid_url(path: str) -> bool:
+        try:
+            result = urlparse(path)
+            return all([result.scheme, result.netloc])
+        except ValueError:
+            return False
+
+    local_documents: typing.List[Document] = []
+    remote_documents: typing.List[IngestRemoteDocument] = []
+
+    for document in documents:
+        if not hasattr(document, "file_path"):
+            raise ValueError("Each document must have a 'file_path' attribute.")
+
+        if is_valid_url(document.file_path):
+            remote_document = IngestRemoteDocument(
+                bucket_id=document.bucket_id,
+                file_name=document.file_name,
+                file_type=document.file_type,
+                process_level=document.process_level,
+                search_data=document.search_data,
+                source_url=document.file_path,
+            )
+            remote_documents.append(remote_document)
+        elif is_valid_local_path(document.file_path):
+            local_documents.append(document)
+        else:
+            raise ValueError(f"Invalid file path: {document.file_path}")
+
+    return remote_documents, local_documents
+
+
+def split_doc(file):
+    if file.is_file() and (
+        file.suffix.lower() in ALLOWED_SUFFIXES
+        or file.suffix.lower() in SUFFIX_ALIASES
+    ):
+        if file.suffix.lower() in CSV_SPLITS:
+            return CSVSplitter(filepath=file).split()
+        elif file.suffix.lower() in TSV_SPLITS:
+            return CSVSplitter(filepath=file, delimiter='\t').split()
+        return [file]
+    return []
+
```
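The `upload_api` endpoint used throughout this file is only loosely described in the docstrings; its contract can be read off `get_presigned_url` above and `_upload_file` further down. A sketch of a response such an endpoint might return, with every value illustrative rather than a documented API contract:

```python
# Shape of a response the upload helpers would accept, read off
# get_presigned_url() above and _upload_file() below; all values here are
# illustrative, not a documented API contract.
example_presigned_response = {
    "URL": "https://uploads.example.com/my_file1.txt?signature=...",
    "Header": {"Content-Type": ["text/plain"]},  # list values collapse to value[0]
    "Method": "PUT",                             # the only method _upload_file supports
}
```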
groundx-2.2.9/src/groundx/ingest.py (continued):

```diff
+class GroundX(GroundXBase):
+    def ingest(
+        self,
+        *,
+        documents: typing.Sequence[Document],
+        upload_api: typing.Optional[str] = "https://api.eyelevel.ai/upload/file",
+        request_options: typing.Optional[RequestOptions] = None,
+    ) -> IngestResponse:
+        """
+        Ingest local or hosted documents into a GroundX bucket.
+
+        Parameters
+        ----------
+        documents : typing.Sequence[Document]
+
+        # an endpoint that accepts 'name' and 'type' query params
+        # and returns a presigned URL in a JSON dictionary with key 'URL'
+        upload_api : typing.Optional[str]
+
+        request_options : typing.Optional[RequestOptions]
+            Request-specific configuration.
+
+        Returns
+        -------
+        IngestResponse
+            Documents successfully uploaded
+
+        Examples
+        --------
+        from groundx import Document, GroundX
+
+        client = GroundX(
+            api_key="YOUR_API_KEY",
+        )
+
+        client.ingest(
+            documents=[
+                Document(
+                    bucket_id=1234,
+                    file_name="my_file1.txt",
+                    file_path="https://my.source.url.com/file1.txt",
+                    file_type="txt",
+                )
+            ],
+        )
+        """
+        remote_documents, local_documents = prep_documents(documents)
+
+        if len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
+            raise ValueError("You have sent too many documents in this request")
+
+        if len(remote_documents) + len(local_documents) == 0:
+            raise ValueError("No valid documents were provided")
+
+        for d in local_documents:
+            splits = split_doc(Path(os.path.expanduser(d.file_path)))
+
+            for sd in splits:
+                url = self._upload_file(upload_api, sd)
+
+                ft = d.file_type
+                if sd.suffix.lower() in SUFFIX_ALIASES:
+                    ft = SUFFIX_ALIASES[sd.suffix.lower()]
+
+                fn = sd.name
+                if len(splits) == 1 and d.file_name:
+                    fn = d.file_name
+
+                remote_documents.append(
+                    IngestRemoteDocument(
+                        bucket_id=d.bucket_id,
+                        file_name=fn,
+                        file_type=ft,
+                        process_level=d.process_level,
+                        search_data=d.search_data,
+                        source_url=url,
+                    )
+                )
+
+        return self.documents.ingest_remote(
+            documents=remote_documents,
+            request_options=request_options,
+        )
+
```
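Worth noting: when a local CSV or TSV goes through this method, `split_doc` may fan it out into several uploads, and the supplied `file_name` is only kept when the file is not split (otherwise the generated part names such as `transactions_1.csv` are used). A hedged example of a local ingest, with path and bucket ID illustrative:

```python
# Hedged sketch: ingesting one local CSV with the synchronous client; the
# path and bucket ID are illustrative. A large file is fanned out into
# header-bearing parts by CSVSplitter before upload.
from groundx import Document, GroundX

client = GroundX(api_key="YOUR_API_KEY")

response = client.ingest(
    documents=[
        Document(
            bucket_id=1234,
            file_name="transactions.csv",   # kept only if the file is not split
            file_path="~/data/transactions.csv",
            file_type="csv",
        )
    ],
)
print(response.ingest.process_id)
```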
groundx-2.2.9/src/groundx/ingest.py (continued):

```diff
+    def ingest_directory(
+        self,
+        *,
+        bucket_id: int,
+        path: str,
+        batch_size: typing.Optional[int] = 10,
+        upload_api: typing.Optional[str] = "https://api.eyelevel.ai/upload/file",
+        request_options: typing.Optional[RequestOptions] = None,
+    ):
+        """
+        Ingest documents from a local directory into a GroundX bucket.
+
+        Parameters
+        ----------
+        bucket_id : int
+        path : str
+        batch_size : typing.Optional[int]
+
+        # an endpoint that accepts 'name' and 'type' query params
+        # and returns a presigned URL in a JSON dictionary with key 'URL'
+        upload_api : typing.Optional[str]
+
+        request_options : typing.Optional[RequestOptions]
+            Request-specific configuration.
+
+        Returns
+        -------
+        IngestResponse
+            Documents successfully uploaded
+
+        Examples
+        --------
+        from groundx import Document, GroundX
+
+        client = GroundX(
+            api_key="YOUR_API_KEY",
+        )
+
+        client.ingest_directory(
+            bucket_id=0,
+            path="/path/to/directory"
+        )
+        """
+
+        def is_valid_local_directory(path: str) -> bool:
+            expanded_path = os.path.expanduser(path)
+            return os.path.isdir(expanded_path)
+
+        def load_directory_files(directory: str) -> typing.List[Path]:
+            dir_path = Path(directory)
+
+            matched_files: typing.List[Path] = []
+            for file in dir_path.rglob("*"):
+                for sd in split_doc(file):
+                    matched_files.append(sd)
+
+            return matched_files
+
+        if bucket_id < 1:
+            raise ValueError(f"Invalid bucket_id: {bucket_id}")
+
+        if is_valid_local_directory(path) is not True:
+            raise ValueError(f"Invalid directory path: {path}")
+
+        files = load_directory_files(path)
+
+        if len(files) < 1:
+            raise ValueError(f"No supported files found in directory: {path}")
+
+        current_batch: typing.List[Path] = []
+        current_batch_size: int = 0
+
+        n = max(MIN_BATCH_SIZE, min(batch_size or MIN_BATCH_SIZE, MAX_BATCH_SIZE))
+
+        with tqdm(total=len(files), desc="Ingesting Files", unit="file") as pbar:
+            for file in files:
+                file_size = file.stat().st_size
+
+                if (current_batch_size + file_size > MAX_BATCH_SIZE_BYTES) or (len(current_batch) >= n):
+                    self._upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)
+                    current_batch = []
+                    current_batch_size = 0
+
+                current_batch.append(file)
+                current_batch_size += file_size
+
+            if current_batch:
+                self._upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)
+
+    def _upload_file(
+        self,
+        endpoint,
+        file_path,
+    ):
+        file_name = os.path.basename(file_path)
+        file_extension = os.path.splitext(file_name)[1][1:].lower()
+        if f".{file_extension}" in SUFFIX_ALIASES:
+            file_extension = SUFFIX_ALIASES[f".{file_extension}"]
+
+        presigned_info = get_presigned_url(endpoint, file_name, file_extension)
+
+        upload_url = presigned_info["URL"]
+        headers = presigned_info.get("Header", {})
+        method = presigned_info.get("Method", "PUT").upper()
+
+        for key, value in headers.items():
+            if isinstance(value, list):
+                headers[key] = value[0]
+
+        try:
+            with open(file_path, "rb") as f:
+                file_data = f.read()
+        except Exception as e:
+            raise ValueError(f"Error reading file {file_path}: {e}")
+
+        if method == "PUT":
+            upload_response = requests.put(upload_url, data=file_data, headers=headers)
+        else:
+            raise ValueError(f"Unsupported HTTP method: {method}")
+
+        if upload_response.status_code not in (200, 201):
+            raise Exception(
+                f"Upload failed: {upload_response.status_code} - {upload_response.text}"
+            )
+
+        return strip_query_params(upload_url)
+
+    def _upload_file_batch(
+        self,
+        bucket_id,
+        batch,
+        upload_api,
+        request_options,
+        pbar,
+    ):
+        docs = []
+
+        progress = len(batch)
+        for file in batch:
+            url = self._upload_file(upload_api, file)
+            if file.suffix.lower() in SUFFIX_ALIASES:
+                docs.append(
+                    Document(
+                        bucket_id=bucket_id,
+                        file_name=file.name,
+                        file_path=url,
+                        file_type=SUFFIX_ALIASES[file.suffix.lower()],
+                    ),
+                )
+            else:
+                docs.append(
+                    Document(
+                        bucket_id=bucket_id,
+                        file_name=file.name,
+                        file_path=url,
+                    ),
+                )
+            pbar.update(0.25)
+            progress -= 0.25
+
+        if docs:
+            ingest = self.ingest(documents=docs, request_options=request_options)
+
+            completed_files = set()
+
+            while (
+                ingest is not None
+                and ingest.ingest.status not in ["complete", "error", "cancelled"]
+            ):
+                time.sleep(3)
+                ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)
+
+                if ingest.ingest.progress:
+                    if ingest.ingest.progress.processing and ingest.ingest.progress.processing.documents:
+                        for doc in ingest.ingest.progress.processing.documents:
+                            if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                pbar.update(0.75)
+                                progress -= 0.75
+                    if ingest.ingest.progress.complete and ingest.ingest.progress.complete.documents:
+                        for doc in ingest.ingest.progress.complete.documents:
+                            if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                pbar.update(0.75)
+                                progress -= 0.75
+                    if ingest.ingest.progress.cancelled and ingest.ingest.progress.cancelled.documents:
+                        for doc in ingest.ingest.progress.cancelled.documents:
+                            if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                pbar.update(0.75)
+                                progress -= 0.75
+                    if ingest.ingest.progress.errors and ingest.ingest.progress.errors.documents:
+                        for doc in ingest.ingest.progress.errors.documents:
+                            if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                pbar.update(0.75)
+                                progress -= 0.75
+
+
+            if ingest.ingest.status in ["error", "cancelled"]:
+                raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")
+
+            if progress > 0:
+                pbar.update(progress)
+
+
+
```
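Before the async variant, a hedged usage sketch of the new directory ingestion, with bucket ID and path illustrative. It batches files (default 10, hard caps of 50 files and 50 MiB per batch), uploads each through the presigned-URL flow, and polls `get_processing_status_by_id` every three seconds while advancing the tqdm bar:

```python
# Hedged sketch of driving GroundX.ingest_directory; the bucket ID and
# directory path are illustrative, not part of the package.
from groundx import GroundX

client = GroundX(api_key="YOUR_API_KEY")

# Walks the directory recursively, splits large CSV/TSV files, uploads in
# batches, and blocks while polling each batch to a terminal state.
client.ingest_directory(
    bucket_id=1234,
    path="/path/to/directory",
)
```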
groundx-2.2.9/src/groundx/ingest.py (continued):

```diff
+class AsyncGroundX(AsyncGroundXBase):
+    async def ingest(
+        self,
+        *,
+        documents: typing.Sequence[Document],
+        upload_api: str = "https://api.eyelevel.ai/upload/file",
+        request_options: typing.Optional[RequestOptions] = None,
+    ) -> IngestResponse:
+        """
+        Ingest local or hosted documents into a GroundX bucket.
+
+        Parameters
+        ----------
+        documents : typing.Sequence[Document]
+
+        # an endpoint that accepts 'name' and 'type' query params
+        # and returns a presigned URL in a JSON dictionary with key 'URL'
+        upload_api : typing.Optional[str]
+
+        request_options : typing.Optional[RequestOptions]
+            Request-specific configuration.
+
+        Returns
+        -------
+        IngestResponse
+            Documents successfully uploaded
+
+        Examples
+        --------
+        import asyncio
+
+        from groundx import AsyncGroundX, Document
+
+        client = AsyncGroundX(
+            api_key="YOUR_API_KEY",
+        )
+
+        async def main() -> None:
+            await client.ingest(
+                documents=[
+                    Document(
+                        bucket_id=1234,
+                        file_name="my_file1.txt",
+                        file_path="https://my.source.url.com/file1.txt",
+                        file_type="txt",
+                    )
+                ],
+            )
+
+        asyncio.run(main())
+        """
+        remote_documents, local_documents = prep_documents(documents)
+
+        if len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
+            raise ValueError("You have sent too many documents in this request")
+
+        if len(remote_documents) + len(local_documents) == 0:
+            raise ValueError("No valid documents were provided")
+
+        for d in local_documents:
+            splits = split_doc(Path(os.path.expanduser(d.file_path)))
+
+            for sd in splits:
+                url = self._upload_file(upload_api, sd)
+
+                ft = d.file_type
+                if sd.suffix.lower() in SUFFIX_ALIASES:
+                    ft = SUFFIX_ALIASES[sd.suffix.lower()]
+
+                fn = sd.name
+                if len(splits) == 1 and d.file_name:
+                    fn = d.file_name
+
+                remote_documents.append(
+                    IngestRemoteDocument(
+                        bucket_id=d.bucket_id,
+                        file_name=fn,
+                        file_type=ft,
+                        process_level=d.process_level,
+                        search_data=d.search_data,
+                        source_url=url,
+                    )
+                )
+
+        return await self.documents.ingest_remote(
+            documents=remote_documents,
+            request_options=request_options,
+        )
+
+    def _upload_file(
+        self,
+        endpoint,
+        file_path,
+    ):
+        file_name = os.path.basename(file_path)
+        file_extension = os.path.splitext(file_name)[1][1:].lower()
+        if f".{file_extension}" in SUFFIX_ALIASES:
+            file_extension = SUFFIX_ALIASES[f".{file_extension}"]
+
+        presigned_info = get_presigned_url(endpoint, file_name, file_extension)
+
+        upload_url = presigned_info["URL"]
+        headers = presigned_info.get("Header", {})
+        method = presigned_info.get("Method", "PUT").upper()
+
+        for key, value in headers.items():
+            if isinstance(value, list):
+                headers[key] = value[0]
+
+        try:
+            with open(file_path, "rb") as f:
+                file_data = f.read()
+        except Exception as e:
+            raise ValueError(f"Error reading file {file_path}: {e}")
+
+        if method == "PUT":
+            upload_response = requests.put(upload_url, data=file_data, headers=headers)
+        else:
+            raise ValueError(f"Unsupported HTTP method: {method}")
+
+        if upload_response.status_code not in (200, 201):
+            raise Exception(
+                f"Upload failed: {upload_response.status_code} - {upload_response.text}"
+            )
+
+        return strip_query_params(upload_url)
```
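A final hedged sketch tying the async client to the status fields used by `_upload_file_batch` above. Identifiers and paths are illustrative, and note that `AsyncGroundX._upload_file` still performs the actual upload with a blocking `requests.put` call:

```python
# Hedged sketch: ingesting a local file with AsyncGroundX, then printing the
# process handle a caller could poll; all identifiers are illustrative.
import asyncio

from groundx import AsyncGroundX, Document

client = AsyncGroundX(api_key="YOUR_API_KEY")

async def main() -> None:
    # Local paths go through the presigned-URL upload flow shown above.
    response = await client.ingest(
        documents=[
            Document(
                bucket_id=1234,
                file_name="report.pdf",
                file_path="~/documents/report.pdf",
                file_type="pdf",
            )
        ],
    )
    # The same fields _upload_file_batch polls on the sync client.
    print(response.ingest.process_id, response.ingest.status)

asyncio.run(main())
```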