PyPI - groundx - Versions diffs - 2.2.8__py3-none-any.whl → 2.2.9__py3-none-any.whl - Mend

groundx 2.2.8__py3-none-any.whl → 2.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

groundx/core/client_wrapper.py +1 -1
groundx/csv_splitter.py +64 -0
groundx/ingest.py +121 -77
{groundx-2.2.8.dist-info → groundx-2.2.9.dist-info}/METADATA +1 -1
{groundx-2.2.8.dist-info → groundx-2.2.9.dist-info}/RECORD +7 -6
{groundx-2.2.8.dist-info → groundx-2.2.9.dist-info}/LICENSE +0 -0
{groundx-2.2.8.dist-info → groundx-2.2.9.dist-info}/WHEEL +0 -0

groundx/core/client_wrapper.py CHANGED Viewed

@@ -16,7 +16,7 @@ class BaseClientWrapper:
         headers: typing.Dict[str, str] = {
             "X-Fern-Language": "Python",
             "X-Fern-SDK-Name": "groundx",
-            "X-Fern-SDK-Version": "2.2.8",
+            "X-Fern-SDK-Version": "2.2.9",
         }
         headers["X-API-Key"] = self.api_key
         return headers

groundx/csv_splitter.py ADDED Viewed

@@ -0,0 +1,64 @@
+import csv, math, os, tempfile, typing
+from pathlib import Path
+class CSVSplitter:
+    def __init__(self, filepath, delimiter=','):
+        self.filepath = filepath
+        self.delimiter = delimiter
+        self.filename = os.path.basename(filepath)
+        self.file_size = os.path.getsize(filepath)
+        self.rows_count = self.get_row_count()
+    def get_row_count(self):
+        with open(self.filepath, "r", newline="", encoding="utf-8") as csvfile:
+            return sum(1 for _ in csvfile) - 1
+    def determine_splits(self):
+        row_mod = int(self.rows_count / 1000) + 1
+        file_mod = int(self.file_size / 1024 / 1024) + 1
+        return max(row_mod, file_mod)
+    def split(self):
+        splits = self.determine_splits()
+        if splits < 2:
+            return [Path(self.filepath)]
+        rows_per_file = math.ceil(self.rows_count / splits)
+        split_files: typing.List[Path] = []
+        with open(self.filepath, "r", newline="", encoding="utf-8") as csvfile:
+            reader = csv.reader(csvfile, delimiter=self.delimiter)
+            headers = next(reader)
+            temp_dir = tempfile.mkdtemp()
+            current_file_number = 1
+            current_row = 0
+            current_writer = None
+            current_output_file = None
+            for row in reader:
+                if current_row % rows_per_file == 0:
+                    if current_output_file:
+                        current_output_file.close()
+                    output_file_path = os.path.join(
+                        temp_dir,
+                        f"{os.path.splitext(self.filename)[0]}_{current_file_number}.csv",
+                    )
+                    split_files.append(Path(output_file_path))
+                    current_output_file = open(
+                        output_file_path, "w", newline="", encoding="utf-8"
+                    )
+                    current_writer = csv.writer(current_output_file, delimiter=self.delimiter)
+                    current_writer.writerow(headers)
+                    current_file_number += 1
+                current_writer.writerow(row)
+                current_row += 1
+            if current_output_file:
+                current_output_file.close()
+        return split_files

groundx/ingest.py CHANGED Viewed

@@ -5,8 +5,8 @@ from urllib.parse import urlparse, urlunparse
 from .client import GroundXBase, AsyncGroundXBase
 from .core.request_options import RequestOptions
+from .csv_splitter import CSVSplitter
 from .types.document import Document
-from .types.document_type import DocumentType
 from .types.ingest_remote_document import IngestRemoteDocument
 from .types.ingest_response import IngestResponse
@@ -38,10 +38,18 @@ MIME_TO_DOCUMENT_TYPE = {v: k for k, v in DOCUMENT_TYPE_TO_MIME.items()}
 ALLOWED_SUFFIXES = {f".{k}": v for k, v in DOCUMENT_TYPE_TO_MIME.items()}
+CSV_SPLITS = {
+    ".csv": True,
+}
+TSV_SPLITS = {
+    ".tsv": True,
+}
 SUFFIX_ALIASES = {
-    ".jpeg": ".jpg",
-    ".heic": ".heif",
-    ".tif": ".tiff",
+    ".jpeg": "jpg",
+    ".heic": "heif",
+    ".tif": "tiff",
+    ".md": "txt",
 }
 MAX_BATCH_SIZE = 50
@@ -115,6 +123,18 @@ def prep_documents(
     return remote_documents, local_documents
+def split_doc(file):
+    if file.is_file() and (
+        file.suffix.lower() in ALLOWED_SUFFIXES
+        or file.suffix.lower() in SUFFIX_ALIASES
+    ):
+        if file.suffix.lower() in CSV_SPLITS:
+            return CSVSplitter(filepath=file).split()
+        elif file.suffix.lower() in TSV_SPLITS:
+            return CSVSplitter(filepath=file, delimiter='\t').split()
+        return [file]
+    return []
 class GroundX(GroundXBase):
     def ingest(
         self,
@@ -163,41 +183,39 @@ class GroundX(GroundXBase):
         """
         remote_documents, local_documents = prep_documents(documents)
-        if local_documents and remote_documents:
-            raise ValueError("Documents must all be either local or remote, not a mix.")
-        if len(remote_documents) > 0:
-            if len(remote_documents) > MAX_BATCH_SIZE:
-                raise ValueError("You have sent too many documents in this request")
-            return self.documents.ingest_remote(
-                documents=remote_documents,
-                request_options=request_options,
-            )
-        if len(local_documents) > MAX_BATCH_SIZE:
+        if len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
             raise ValueError("You have sent too many documents in this request")
-        if len(local_documents) == 0:
+        if len(remote_documents) + len(local_documents) == 0:
             raise ValueError("No valid documents were provided")
-        docs: typing.List[IngestRemoteDocument] = []
         for d in local_documents:
-            url = self._upload_file(upload_api, Path(os.path.expanduser(d.file_path)))
-            docs.append(
-                IngestRemoteDocument(
-                    bucket_id=d.bucket_id,
-                    file_name=d.file_name,
-                    file_type=d.file_type,
-                    process_level=d.process_level,
-                    search_data=d.search_data,
-                    source_url=url,
+            splits = split_doc(Path(os.path.expanduser(d.file_path)))
+            for sd in splits:
+                url = self._upload_file(upload_api, sd)
+                ft = d.file_type
+                if sd.suffix.lower() in SUFFIX_ALIASES:
+                    ft = SUFFIX_ALIASES[sd.suffix.lower()]
+                fn = sd.name
+                if len(splits) == 1 and d.file_name:
+                    fn = d.file_name
+                remote_documents.append(
+                    IngestRemoteDocument(
+                        bucket_id=d.bucket_id,
+                        file_name=fn,
+                        file_type=ft,
+                        process_level=d.process_level,
+                        search_data=d.search_data,
+                        source_url=url,
+                    )
                 )
-            )
         return self.documents.ingest_remote(
-            documents=docs,
+            documents=remote_documents,
             request_options=request_options,
         )
@@ -252,14 +270,10 @@ class GroundX(GroundXBase):
         def load_directory_files(directory: str) -> typing.List[Path]:
             dir_path = Path(directory)
-            matched_files = [
-                file
-                for file in dir_path.rglob("*")
-                if file.is_file() and (
-                    file.suffix.lower() in ALLOWED_SUFFIXES
-                    or file.suffix.lower() in SUFFIX_ALIASES
-                )
-            ]
+            matched_files: typing.List[Path] = []
+            for file in dir_path.rglob("*"):
+                for sd in split_doc(file):
+                    matched_files.append(sd)
             return matched_files
@@ -301,6 +315,8 @@ class GroundX(GroundXBase):
     ):
         file_name = os.path.basename(file_path)
         file_extension = os.path.splitext(file_name)[1][1:].lower()
+        if f".{file_extension}" in SUFFIX_ALIASES:
+            file_extension = SUFFIX_ALIASES[f".{file_extension}"]
         presigned_info = get_presigned_url(endpoint, file_name, file_extension)
@@ -343,12 +359,23 @@ class GroundX(GroundXBase):
         progress = len(batch)
         for file in batch:
             url = self._upload_file(upload_api, file)
-            docs.append(
-                Document(
-                    bucket_id=bucket_id,
-                    file_path=url,
-                ),
-            )
+            if file.suffix.lower() in SUFFIX_ALIASES:
+                docs.append(
+                    Document(
+                        bucket_id=bucket_id,
+                        file_name=file.name,
+                        file_path=url,
+                        file_type=SUFFIX_ALIASES[file.suffix.lower()],
+                    ),
+                )
+            else:
+                docs.append(
+                    Document(
+                        bucket_id=bucket_id,
+                        file_name=file.name,
+                        file_path=url,
+                    ),
+                )
             pbar.update(0.25)
             progress -= 0.25
@@ -364,11 +391,28 @@ class GroundX(GroundXBase):
                 time.sleep(3)
                 ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)
-                if ingest.ingest.progress and ingest.ingest.progress.processing:
-                    for doc in ingest.ingest.progress.processing.documents:
-                        if doc.status == "complete" and doc.document_id not in completed_files:
-                            pbar.update(0.75)
-                            progress -= 0.75
+                if ingest.ingest.progress:
+                    if ingest.ingest.progress.processing and ingest.ingest.progress.processing.documents:
+                        for doc in ingest.ingest.progress.processing.documents:
+                            if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                pbar.update(0.75)
+                                progress -= 0.75
+                    if ingest.ingest.progress.complete and ingest.ingest.progress.complete.documents:
+                        for doc in ingest.ingest.progress.complete.documents:
+                            if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                pbar.update(0.75)
+                                progress -= 0.75
+                    if ingest.ingest.progress.cancelled and ingest.ingest.progress.cancelled.documents:
+                        for doc in ingest.ingest.progress.cancelled.documents:
+                            if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                pbar.update(0.75)
+                                progress -= 0.75
+                    if ingest.ingest.progress.errors and ingest.ingest.progress.errors.documents:
+                        for doc in ingest.ingest.progress.errors.documents:
+                            if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                pbar.update(0.75)
+                                progress -= 0.75
             if ingest.ingest.status in ["error", "cancelled"]:
                 raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")
@@ -431,41 +475,39 @@ class AsyncGroundX(AsyncGroundXBase):
         """
         remote_documents, local_documents = prep_documents(documents)
-        if local_documents and remote_documents:
-            raise ValueError("Documents must all be either local or remote, not a mix.")
-        if len(remote_documents) > 0:
-            if len(remote_documents) > MAX_BATCH_SIZE:
-                raise ValueError("You have sent too many documents in this request")
-            return await self.documents.ingest_remote(
-                documents=remote_documents,
-                request_options=request_options,
-            )
-        if len(local_documents) > MAX_BATCH_SIZE:
+        if len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
             raise ValueError("You have sent too many documents in this request")
-        if len(local_documents) == 0:
+        if len(remote_documents) + len(local_documents) == 0:
             raise ValueError("No valid documents were provided")
-        docs: typing.List[IngestRemoteDocument] = []
         for d in local_documents:
-            url = self._upload_file(upload_api, Path(os.path.expanduser(d.file_path)))
-            docs.append(
-                IngestRemoteDocument(
-                    bucket_id=d.bucket_id,
-                    file_name=d.file_name,
-                    file_type=d.file_type,
-                    process_level=d.process_level,
-                    search_data=d.search_data,
-                    source_url=url,
+            splits = split_doc(Path(os.path.expanduser(d.file_path)))
+            for sd in splits:
+                url = self._upload_file(upload_api, sd)
+                ft = d.file_type
+                if sd.suffix.lower() in SUFFIX_ALIASES:
+                    ft = SUFFIX_ALIASES[sd.suffix.lower()]
+                fn = sd.name
+                if len(splits) == 1 and d.file_name:
+                    fn = d.file_name
+                remote_documents.append(
+                    IngestRemoteDocument(
+                        bucket_id=d.bucket_id,
+                        file_name=fn,
+                        file_type=ft,
+                        process_level=d.process_level,
+                        search_data=d.search_data,
+                        source_url=url,
+                    )
                 )
-            )
         return await self.documents.ingest_remote(
-            documents=docs,
+            documents=remote_documents,
             request_options=request_options,
         )
@@ -476,6 +518,8 @@ class AsyncGroundX(AsyncGroundXBase):
     ):
         file_name = os.path.basename(file_path)
         file_extension = os.path.splitext(file_name)[1][1:].lower()
+        if f".{file_extension}" in SUFFIX_ALIASES:
+            file_extension = SUFFIX_ALIASES[f".{file_extension}"]
         presigned_info = get_presigned_url(endpoint, file_name, file_extension)

{groundx-2.2.8.dist-info → groundx-2.2.9.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: groundx
-Version: 2.2.8
+Version: 2.2.9
 Summary:
 License: MIT
 Requires-Python: >=3.8,<4.0

{groundx-2.2.8.dist-info → groundx-2.2.9.dist-info}/RECORD RENAMED Viewed

@@ -4,7 +4,7 @@ groundx/buckets/client.py,sha256=4jlc9vfIult1mMJ4FZW4_KFJybZPStZt1FUplIgrxbU,239
 groundx/client.py,sha256=dIW9OyrMyfC1N7HSxRrHh0w_8rJ8osNUOPdYD6ueQ6g,6515
 groundx/core/__init__.py,sha256=SQ85PF84B9MuKnBwHNHWemSGuy-g_515gFYNFhvEE0I,1438
 groundx/core/api_error.py,sha256=RE8LELok2QCjABadECTvtDp7qejA1VmINCh6TbqPwSE,426
-groundx/core/client_wrapper.py,sha256=DckEncGF_W3v8kguKAXWcn2um2B9k-nkWvKmH0HoyGQ,1802
+groundx/core/client_wrapper.py,sha256=D6uZpUYxYzmgxNNCTN7quiFvNlBQmPOyLstnrXfcJcs,1802
 groundx/core/datetime_utils.py,sha256=nBys2IsYrhPdszxGKCNRPSOCwa-5DWOHG95FB8G9PKo,1047
 groundx/core/file.py,sha256=d4NNbX8XvXP32z8KpK2Xovv33nFfruIrpz0QWxlgpZk,2663
 groundx/core/http_client.py,sha256=Z77OIxIbL4OAB2IDqjRq_sYa5yNYAWfmdhdCSSvh6Y4,19552
@@ -14,6 +14,7 @@ groundx/core/query_encoder.py,sha256=ekulqNd0j8TgD7ox-Qbz7liqX8-KP9blvT9DsRCenYM
 groundx/core/remove_none_from_dict.py,sha256=EU9SGgYidWq7SexuJbNs4-PZ-5Bl3Vppd864mS6vQZw,342
 groundx/core/request_options.py,sha256=h0QUNCFVdCW_7GclVySCAY2w4NhtXVBUCmHgmzaxpcg,1681
 groundx/core/serialization.py,sha256=D9h_t-RQON3-CHWs1C4ESY9B-Yd5d-l5lnTLb_X896g,9601
+groundx/csv_splitter.py,sha256=6HGXdDpwBX_IJaCbla1WuirJERBTvjLzBf9OBtwGFWU,2254
 groundx/customer/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
 groundx/customer/client.py,sha256=C_JANeDewRD1Kg-q7LPxdiOSWbYSTOiYlBYZLRYPI44,3467
 groundx/documents/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
@@ -26,7 +27,7 @@ groundx/groups/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
 groundx/groups/client.py,sha256=bytQRh9m7e4vIuYHb7dD1kCTQZvyBxedCqGnmmLqrsI,35237
 groundx/health/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
 groundx/health/client.py,sha256=fcTa21RWPyBuT77PQ0EncC6rBaW_DrYlRvudy9-0H58,7545
-groundx/ingest.py,sha256=mQB__GQmIDP6W5jGhmjORXh2o6HHTseHiahZz2Es-tM,16119
+groundx/ingest.py,sha256=LtnUGcgtE1MNYL3PGFrzPqRMnLeOxr-fVsZ3fmTAUKI,18294
 groundx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 groundx/search/__init__.py,sha256=RagVzjShP33mDg9o4N3kGzV0egL1RYNjCpXPE8VzMYE,145
 groundx/search/client.py,sha256=zrrqFy0HowDUYPsMU4nfvDV2RgmkEQ4E8WYNktu3xcs,18684
@@ -81,7 +82,7 @@ groundx/types/subscription_detail.py,sha256=WNfUw2EMVECIvNYcV2s51zZ6T3Utc4zYXw63
 groundx/types/subscription_detail_meters.py,sha256=lBa8-1QlMVHjr5RLGqhiTKnD1KMM0AAHTWvz9TVtG8w,830
 groundx/types/website_source.py,sha256=3WeRCiilNKKBTfhwgjo3jbcVI3vLTeM-KxI6dVzpg9o,1578
 groundx/version.py,sha256=1yVogKaq260fQfckM2RYN2144SEw0QROsZW8ICtkG4U,74
-groundx-2.2.8.dist-info/LICENSE,sha256=dFE6nY1bHnSn6NqmdlghlU1gQqLqYNphrceGVehSa7o,1065
-groundx-2.2.8.dist-info/METADATA,sha256=Q67rLMlsO72-NH2qvK8Bw4ciSTDS5C4LNoPiBk8UWyI,5173
-groundx-2.2.8.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
-groundx-2.2.8.dist-info/RECORD,,
+groundx-2.2.9.dist-info/LICENSE,sha256=dFE6nY1bHnSn6NqmdlghlU1gQqLqYNphrceGVehSa7o,1065
+groundx-2.2.9.dist-info/METADATA,sha256=1BWmC2-Lx8AT1vEY0juSK4ugjxyn9W5ndJy0bomFEwQ,5173
+groundx-2.2.9.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
+groundx-2.2.9.dist-info/RECORD,,

{groundx-2.2.8.dist-info → groundx-2.2.9.dist-info}/LICENSE RENAMED Viewed

File without changes

{groundx-2.2.8.dist-info → groundx-2.2.9.dist-info}/WHEEL RENAMED Viewed

File without changes

groundx 2.2.8__py3-none-any.whl → 2.2.9__py3-none-any.whl

groundx 2.2.8__py3-none-any.whl → 2.2.9__py3-none-any.whl