futurehouse-client 0.4.5.dev10__tar.gz → 0.4.5.dev119__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {futurehouse_client-0.4.5.dev10/src/futurehouse_client.egg-info → futurehouse_client-0.4.5.dev119}/PKG-INFO +2 -1
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/pyproject.toml +1 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client/clients/data_storage_methods.py +345 -29
- futurehouse_client-0.4.5.dev119/src/futurehouse_client/models/client.py +128 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client/models/data_storage_methods.py +47 -2
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client/models/rest.py +8 -1
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client/version.py +3 -3
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119/src/futurehouse_client.egg-info}/PKG-INFO +2 -1
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client.egg-info/requires.txt +1 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/tests/test_data_storage_e2e.py +206 -111
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/tests/test_data_storage_methods.py +156 -32
- futurehouse_client-0.4.5.dev10/src/futurehouse_client/models/client.py +0 -67
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/LICENSE +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/README.md +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/data_storage.md +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/docs/__init__.py +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/docs/client_notebook.ipynb +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/setup.cfg +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client/__init__.py +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client/clients/__init__.py +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client/clients/job_client.py +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client/clients/rest_client.py +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client/models/__init__.py +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client/models/app.py +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client/py.typed +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client/utils/__init__.py +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client/utils/auth.py +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client/utils/general.py +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client/utils/module_utils.py +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client/utils/monitoring.py +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client/utils/world_model_tools.py +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client.egg-info/SOURCES.txt +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client.egg-info/dependency_links.txt +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/src/futurehouse_client.egg-info/top_level.txt +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/tests/test_client.py +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/tests/test_data/test_file.txt +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/tests/test_data/test_information.txt +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/tests/test_data/test_manifest.yaml +0 -0
- {futurehouse_client-0.4.5.dev10 → futurehouse_client-0.4.5.dev119}/tests/test_rest.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: futurehouse-client
-Version: 0.4.5.dev10
+Version: 0.4.5.dev119
 Summary: A client for interacting with endpoints of the FutureHouse service.
 Author-email: FutureHouse technical staff <hello@futurehouse.org>
 License: Apache License
@@ -224,6 +224,7 @@ Requires-Dist: openai<1.100.0,>=1
 Requires-Dist: pydantic
 Requires-Dist: python-dotenv
 Requires-Dist: requests
+Requires-Dist: sqlalchemy-utils>=0.41.2
 Requires-Dist: tenacity
 Requires-Dist: tqdm>=4.62
 Provides-Extra: dev
@@ -35,10 +35,12 @@ from futurehouse_client.models.data_storage_methods import (
     DataStorageResponse,
     DataStorageType,
     DirectoryManifest,
+    GetDatasetAndEntriesResponse,
     ManifestEntry,
 )
 from futurehouse_client.models.rest import (
     DataStorageSearchPayload,
+    FilterLogic,
     SearchCriterion,
 )
 from futurehouse_client.utils.general import retry_if_connection_error
@@ -779,6 +781,7 @@ class DataStorageMethods:
         ignore_patterns: list[str] | None = None,
         ignore_filename: str = ".gitignore",
         project_id: UUID | None = None,
+        tags: list[str] | None = None,
     ) -> DataStorageResponse:
         """Upload a directory as a single zip file collection.

@@ -790,6 +793,7 @@ class DataStorageMethods:
             ignore_patterns: List of patterns to ignore when zipping
             ignore_filename: Name of ignore file to read from directory
             project_id: ID of the project this data storage entry belongs to
+            tags: List of tags to associate with the data storage entry

         Returns:
             DataStorageResponse for the uploaded zip file
@@ -810,6 +814,7 @@ class DataStorageMethods:
             path=zip_gcs_path,
             is_collection=True,
             project_id=project_id,
+            tags=tags,
         )

         logger.debug(
@@ -859,6 +864,7 @@ class DataStorageMethods:
         ignore_patterns: list[str] | None = None,
         ignore_filename: str = ".gitignore",
         project_id: UUID | None = None,
+        tags: list[str] | None = None,
     ) -> DataStorageResponse:
         """Asynchronously upload a directory as a single zip file.

@@ -870,6 +876,7 @@ class DataStorageMethods:
             ignore_patterns: List of patterns to ignore when zipping
             ignore_filename: Name of ignore file to read from directory
             project_id: ID of the project this data storage entry belongs to
+            tags: List of tags to associate with the data storage entry

         Returns:
             DataStorageResponse for the uploaded zip file
@@ -890,6 +897,7 @@ class DataStorageMethods:
             path=zip_gcs_path,
             is_collection=True,
             project_id=project_id,
+            tags=tags,
         )

         data_storage_response = await self._acreate_data_storage_entry(payload)
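The new `tags` parameter flows from the directory-upload call into the storage payload. A minimal sketch of how it might be used, assuming the usual `FutureHouseClient` entry point; the method name `upload_directory` and the name/path arguments are assumptions, since these hunks show only part of the parameter list.

```python
from uuid import UUID

from futurehouse_client import FutureHouseClient

client = FutureHouseClient(api_key="your-api-key")

# `upload_directory` and its first two arguments are assumed names; the hunks
# above only show ignore_patterns, ignore_filename, project_id, and the new
# `tags` parameter.
response = client.upload_directory(
    "experiment-results",              # entry name (assumed positional)
    "./results",                       # local directory to zip (assumed)
    project_id=UUID("00000000-0000-0000-0000-000000000000"),
    tags=["experiment", "2025-q1"],    # new in this release
    ignore_patterns=["*.tmp", "__pycache__/"],
)
print(response.data_storage.tags)
```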
@@ -1524,7 +1532,33 @@ class DataStorageMethods:
             project_id: ID of the project this data storage entry belongs to

         Returns:
-            DataStorageResponse
+            DataStorageResponse: A Pydantic model containing:
+                - data_storage: DataStorageEntry with fields:
+                    - id - Unique identifier for the data storage entry
+                    - name - Name of the data storage entry
+                    - description - Description of the data storage entry
+                    - content - Content of the data storage entry
+                    - embedding - Embedding vector for the content
+                    - is_collection - Whether this entry is a collection
+                    - tags - List of tags associated with the entry
+                    - parent_id - ID of the parent entry for hierarchical storage
+                    - project_id - ID of the project this entry belongs to
+                    - dataset_id - ID of the dataset this entry belongs to
+                    - path - Path in the storage system where this entry is located
+                    - bigquery_schema - Target BigQuery schema for the entry
+                    - user_id - ID of the user who created this entry
+                    - created_at - Timestamp when the entry was created
+                    - modified_at - Timestamp when the entry was last updated
+                - storage_locations with each location containing:
+                    - id - Unique identifier for the storage location
+                    - data_storage_id - ID of the associated data storage entry
+                    - storage_config pydantic model with fields:
+                        - storage_type - Type of storage (e.g., 'gcs', 'pg_table')
+                        - content_type - Type of content stored
+                        - content_schema - Content schema
+                        - metadata - Location metadata
+                        - location - Location path or identifier
+                        - signed_url - Signed URL for uploading/downloading

         Raises:
             DataStorageCreationError: If there's an error creating the data storage entry
@@ -1571,7 +1605,33 @@ class DataStorageMethods:
             project_id: ID of the project this data storage entry belongs to

         Returns:
-            DataStorageResponse
+            DataStorageResponse: A Pydantic model containing:
+                - data_storage: DataStorageEntry with fields:
+                    - id - Unique identifier for the data storage entry
+                    - name - Name of the data storage entry
+                    - description - Description of the data storage entry
+                    - content - Content of the data storage entry
+                    - embedding - Embedding vector for the content
+                    - is_collection - Whether this entry is a collection
+                    - tags - List of tags associated with the entry
+                    - parent_id - ID of the parent entry for hierarchical storage
+                    - project_id - ID of the project this entry belongs to
+                    - dataset_id - ID of the dataset this entry belongs to
+                    - path - Path in the storage system where this entry is located
+                    - bigquery_schema - Target BigQuery schema for the entry
+                    - user_id - ID of the user who created this entry
+                    - created_at - Timestamp when the entry was created
+                    - modified_at - Timestamp when the entry was last updated
+                - storage_locations with each location containing:
+                    - id - Unique identifier for the storage location
+                    - data_storage_id - ID of the associated data storage entry
+                    - storage_config pydantic model with fields:
+                        - storage_type - Type of storage (e.g., 'gcs', 'pg_table')
+                        - content_type - Type of content stored
+                        - content_schema - Content schema
+                        - metadata - Location metadata
+                        - location - Location path or identifier
+                        - signed_url - Signed URL for uploading/downloading

         Raises:
             DataStorageCreationError: If there's an error creating the data storage entry
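The expanded docstrings above enumerate the shape of `DataStorageResponse`. A sketch of navigating it, with attribute names taken from that docstring (anything beyond them is an assumption):

```python
from futurehouse_client.models.data_storage_methods import DataStorageResponse


def summarize(response: DataStorageResponse) -> None:
    """Print the fields called out in the docstrings above."""
    entry = response.data_storage
    print(entry.id, entry.name, entry.is_collection, entry.tags)
    for location in response.storage_locations:
        config = location.storage_config
        # storage_type is e.g. 'gcs' or 'pg_table'; signed_url is the URL used
        # for uploading or downloading the underlying object.
        print(config.storage_type, config.location, config.signed_url)
```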
@@ -1734,6 +1794,7 @@ class DataStorageMethods:
         ignore_patterns: list[str] | None = None,
         ignore_filename: str = ".gitignore",
         project_id: UUID | None = None,
+        dataset_id: UUID | None = None,
     ) -> DataStorageResponse:
         """Store file or directory content in the data storage system.

@@ -1749,13 +1810,45 @@ class DataStorageMethods:
             description: Optional description of the data storage entry
             path: Optional path for the data storage entry
             as_collection: If true, upload directories as a single zip file collection.
-            manifest_filename: Name of manifest file
+            manifest_filename: Name of manifest file (JSON or YAML) containing:
+                - entries - Map of file/directory names to their manifest entries
+                - Each ManifestEntry contains:
+                    - description - Description of the file or directory
+                    - metadata - Additional metadata for the entry
+                - Each DirectoryManifest contains nested entries following the same structure
             ignore_patterns: List of patterns to ignore when zipping directories
             ignore_filename: Name of ignore file to read from directory (default: .gitignore)
             project_id: ID of the project this data storage entry belongs to
+            dataset_id: ID of the dataset this data storage entry belongs to

         Returns:
-            DataStorageResponse
+            DataStorageResponse: A Pydantic model containing:
+                - data_storage: DataStorageEntry with fields:
+                    - id - Unique identifier for the data storage entry
+                    - name - Name of the data storage entry
+                    - description - Description of the data storage entry
+                    - content - Content of the data storage entry
+                    - embedding - Embedding vector for the content
+                    - is_collection - Whether this entry is a collection
+                    - tags - List of tags associated with the entry
+                    - parent_id - ID of the parent entry for hierarchical storage
+                    - project_id - ID of the project this entry belongs to
+                    - dataset_id - ID of the dataset this entry belongs to
+                    - path - Path in the storage system where this entry is located
+                    - bigquery_schema - Target BigQuery schema for the entry
+                    - user_id - ID of the user who created this entry
+                    - created_at - Timestamp when the entry was created
+                    - modified_at - Timestamp when the entry was last updated
+                - storage_locations with each location containing:
+                    - id - Unique identifier for the storage location
+                    - data_storage_id - ID of the associated data storage entry
+                    - storage_config pydantic model with fields:
+                        - storage_type - Type of storage (e.g., 'gcs', 'pg_table')
+                        - content_type - Type of content stored
+                        - content_schema - Content schema
+                        - metadata - Location metadata
+                        - location - Location path or identifier
+                        - signed_url - Signed URL for uploading/downloading

         Raises:
             DataStorageCreationError: If there's an error in the process
@@ -1782,6 +1875,7 @@ class DataStorageMethods:
             ignore_patterns=ignore_patterns,
             ignore_filename=ignore_filename,
             project_id=project_id,
+            dataset_id=dataset_id,
         )
         if not responses:
             raise DataStorageCreationError(
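The manifest file referenced by `manifest_filename` can be JSON or YAML and maps file or directory names to manifest entries. A small JSON sketch matching the structure described above; any nesting beyond what the docstring states (for example the top-level `entries` key) is an assumption.

```python
import json

# Hypothetical manifest content: each entry may carry a description and
# metadata, and directories carry nested entries following the same structure.
manifest = {
    "entries": {
        "results.csv": {
            "description": "Aggregated experiment results",
            "metadata": {"rows": 1200},
        },
        "plots": {
            "entries": {
                "loss.png": {"description": "Training loss curve"},
            },
        },
    },
}

with open("manifest.json", "w") as f:
    json.dump(manifest, f, indent=2)
```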
@@ -1827,15 +1921,47 @@ class DataStorageMethods:
             path: Optional GCS path for the entry.
             as_collection: If uploading a directory, `True` zips it into a single collection,
                 `False` uploads it as a hierarchical structure of individual objects.
-            manifest_filename: Optional manifest file for hierarchical uploads
+            manifest_filename: Optional manifest file (JSON or YAML) for hierarchical uploads containing:
+                - entries - Map of file/directory names to their manifest entries
+                - Each ManifestEntry contains:
+                    - description - Description of the file or directory
+                    - metadata - Additional metadata for the entry
+                - Each DirectoryManifest contains nested entries following the same structure
             ignore_patterns: List of patterns to ignore when zipping.
             ignore_filename: Name of ignore file to read (default: .gitignore).
             dataset_id: Optional dataset ID to add entry to, or None to create new dataset.
             project_id: ID of the project this data storage entry belongs to

         Returns:
-
-
+            DataStorageResponse: A Pydantic model containing:
+                - data_storage: DataStorageEntry with fields:
+                    - id - Unique identifier for the data storage entry
+                    - name - Name of the data storage entry
+                    - description - Description of the data storage entry
+                    - content - Content of the data storage entry
+                    - embedding - Embedding vector for the content
+                    - is_collection - Whether this entry is a collection
+                    - tags - List of tags associated with the entry
+                    - parent_id - ID of the parent entry for hierarchical storage
+                    - project_id - ID of the project this entry belongs to
+                    - dataset_id - ID of the dataset this entry belongs to
+                    - path - Path in the storage system where this entry is located
+                    - bigquery_schema - Target BigQuery schema for the entry
+                    - user_id - ID of the user who created this entry
+                    - created_at - Timestamp when the entry was created
+                    - modified_at - Timestamp when the entry was last updated
+                - storage_locations with each location containing:
+                    - id - Unique identifier for the storage location
+                    - data_storage_id - ID of the associated data storage entry
+                    - storage_config pydantic model with fields:
+                        - storage_type - Type of storage (e.g., 'gcs', 'pg_table')
+                        - content_type - Type of content stored
+                        - content_schema - Content schema
+                        - metadata - Location metadata
+                        - location - Location path or identifier
+                        - signed_url - Signed URL for uploading/downloading
+
+            For hierarchical uploads, this is the response for the root directory entry.
         """
         file_path = self._validate_file_path(file_path)

@@ -1896,7 +2022,12 @@ class DataStorageMethods:

         Args:
             name: Name of the data storage entry
-            existing_location:
+            existing_location: a pydantic model describing the existing data source location to register, containing:
+                - storage_type - Type of storage (BIGQUERY, GCS, PG_TABLE, RAW_CONTENT, ELASTIC_SEARCH)
+                - content_type - Type of content (BQ_DATASET, BQ_TABLE, TEXT, TEXT_W_EMBEDDINGS, DIRECTORY, FILE, INDEX, INDEX_W_EMBEDDINGS)
+                - content_schema - Content schema for the data
+                - metadata - Additional metadata for the location
+                - location - Location path or identifier
             description: Optional description of the data storage entry
             as_collection: If uploading a directory, `True` creates a single storage entry for
                 the whole directory and multiple storage locations for each file, `False` assumes
@@ -1905,7 +2036,33 @@ class DataStorageMethods:
             project_id: ID of the project this data storage entry belongs to

         Returns:
-            DataStorageResponse
+            DataStorageResponse: A Pydantic model containing:
+                - data_storage: DataStorageEntry with fields:
+                    - id - Unique identifier for the data storage entry
+                    - name - Name of the data storage entry
+                    - description - Description of the data storage entry
+                    - content - Content of the data storage entry
+                    - embedding - Embedding vector for the content
+                    - is_collection - Whether this entry is a collection
+                    - tags - List of tags associated with the entry
+                    - parent_id - ID of the parent entry for hierarchical storage
+                    - project_id - ID of the project this entry belongs to
+                    - dataset_id - ID of the dataset this entry belongs to
+                    - path - Path in the storage system where this entry is located
+                    - bigquery_schema - Target BigQuery schema for the entry
+                    - user_id - ID of the user who created this entry
+                    - created_at - Timestamp when the entry was created
+                    - modified_at - Timestamp when the entry was last updated
+                - storage_locations with each location containing:
+                    - id - Unique identifier for the storage location
+                    - data_storage_id - ID of the associated data storage entry
+                    - storage_config pydantic model with fields:
+                        - storage_type - Type of storage (e.g., 'gcs', 'pg_table')
+                        - content_type - Type of content stored
+                        - content_schema - Content schema
+                        - metadata - Location metadata
+                        - location - Location path or identifier
+                        - signed_url - Signed URL for uploading/downloading

         Raises:
             DataStorageCreationError: If there's an error creating the data storage entry
@@ -1951,7 +2108,12 @@ class DataStorageMethods:

         Args:
             name: Name of the data storage entry
-            existing_location:
+            existing_location: a pydantic model describing the existing data source location to register, containing:
+                - storage_type - Type of storage (BIGQUERY, GCS, PG_TABLE, RAW_CONTENT, ELASTIC_SEARCH)
+                - content_type - Type of content (BQ_DATASET, BQ_TABLE, TEXT, TEXT_W_EMBEDDINGS, DIRECTORY, FILE, INDEX, INDEX_W_EMBEDDINGS)
+                - content_schema - Content schema for the data
+                - metadata - Additional metadata for the location
+                - location - Location path or identifier
             description: Optional description of the data storage entry
             as_collection: If uploading a directory, `True` creates a single storage entry for
                 the whole directory and multiple storage locations for each file, `False` assumes
@@ -1960,7 +2122,33 @@ class DataStorageMethods:
             project_id: ID of the project this data storage entry belongs to

         Returns:
-            DataStorageResponse
+            DataStorageResponse: A Pydantic model containing:
+                - data_storage: DataStorageEntry with fields:
+                    - id - Unique identifier for the data storage entry
+                    - name - Name of the data storage entry
+                    - description - Description of the data storage entry
+                    - content - Content of the data storage entry
+                    - embedding - Embedding vector for the content
+                    - is_collection - Whether this entry is a collection
+                    - tags - List of tags associated with the entry
+                    - parent_id - ID of the parent entry for hierarchical storage
+                    - project_id - ID of the project this entry belongs to
+                    - dataset_id - ID of the dataset this entry belongs to
+                    - path - Path in the storage system where this entry is located
+                    - bigquery_schema - Target BigQuery schema for the entry
+                    - user_id - ID of the user who created this entry
+                    - created_at - Timestamp when the entry was created
+                    - modified_at - Timestamp when the entry was last updated
+                - storage_locations with each location containing:
+                    - id - Unique identifier for the storage location
+                    - data_storage_id - ID of the associated data storage entry
+                    - storage_config pydantic model with fields:
+                        - storage_type - Type of storage (e.g., 'gcs', 'pg_table')
+                        - content_type - Type of content stored
+                        - content_schema - Content schema
+                        - metadata - Location metadata
+                        - location - Location path or identifier
+                        - signed_url - Signed URL for uploading/downloading

         Raises:
             DataStorageCreationError: If there's an error creating the data storage entry
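Registering an existing data source takes a location payload with the fields listed above. A heavily hedged sketch: only the field names and the enum value lists come from the docstring, while the method name and whether the payload can be passed as a plain mapping (rather than the corresponding pydantic model) are assumptions.

```python
from futurehouse_client import FutureHouseClient

client = FutureHouseClient(api_key="your-api-key")

# Assumed method name and payload form; the hunks document only the payload
# fields (storage_type, content_type, content_schema, metadata, location).
existing_location = {
    "storage_type": "GCS",    # one of BIGQUERY, GCS, PG_TABLE, RAW_CONTENT, ELASTIC_SEARCH
    "content_type": "FILE",   # e.g. BQ_DATASET, BQ_TABLE, TEXT, DIRECTORY, FILE, INDEX
    "location": "gs://my-bucket/corpus/papers.zip",
    "metadata": {"source": "manual registration"},
}

response = client.register_existing_data_source(   # assumed method name
    name="papers-corpus",
    existing_location=existing_location,
    description="Pre-existing GCS archive registered as a collection",
    as_collection=True,
)
print(response.data_storage.id)
```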
@@ -1997,12 +2185,17 @@ class DataStorageMethods:
         self,
         criteria: list[SearchCriterion] | None = None,
         size: int = 10,
+        filter_logic: FilterLogic = FilterLogic.OR,
     ) -> list[dict]:
         """Search data storage objects using structured criteria.

         Args:
-            criteria: List of
+            criteria: List of SearchCriterion pydantic models with fields:
+                - field - Field name to search on
+                - operator - Search operator (EQUALS, CONTAINS, STARTS_WITH, ENDS_WITH, GREATER_THAN, LESS_THAN, BETWEEN, IN)
+                - value - Value to search for
             size: Number of results to return (1-100)
+            filter_logic: Either "AND" (all criteria must match) or "OR" (at least one must match)

         Returns:
             List of search results with scores and data storage information
@@ -2023,6 +2216,7 @@ class DataStorageMethods:
         payload = DataStorageSearchPayload(
             criteria=criteria or [],
             size=max(1, min(100, size)),  # Clamp between 1-100
+            filter_logic=filter_logic,
         )

         response = self.client.post(
@@ -2053,12 +2247,17 @@ class DataStorageMethods:
         self,
         criteria: list[SearchCriterion] | None = None,
         size: int = 10,
+        filter_logic: FilterLogic = FilterLogic.OR,
     ) -> list[dict]:
         """Asynchronously search data storage objects using structured criteria.

         Args:
-            criteria: List of
+            criteria: List of SearchCriterion pydantic models with fields:
+                - field - Field name to search on
+                - operator - Search operator (EQUALS, CONTAINS, STARTS_WITH, ENDS_WITH, GREATER_THAN, LESS_THAN, BETWEEN, IN)
+                - value - Value to search for
             size: Number of results to return (1-100)
+            filter_logic: Either "AND" (all criteria must match) or "OR" (at least one must match)

         Returns:
             List of search results with scores and data storage information
@@ -2079,6 +2278,7 @@ class DataStorageMethods:
         payload = DataStorageSearchPayload(
             criteria=criteria or [],
             size=max(1, min(100, size)),  # Clamp between 1-100
+            filter_logic=filter_logic,
         )

         response = await self.async_client.post(
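Structured search now accepts a `filter_logic` argument (defaulting to `FilterLogic.OR`). A sketch of how it might be called; the search method's name is not shown in these hunks, and whether `operator` takes a string or an enum member may differ in your version.

```python
from futurehouse_client import FutureHouseClient
from futurehouse_client.models.rest import FilterLogic, SearchCriterion

client = FutureHouseClient(api_key="your-api-key")

criteria = [
    # Operator spellings follow the docstring above (EQUALS, CONTAINS,
    # STARTS_WITH, ENDS_WITH, GREATER_THAN, LESS_THAN, BETWEEN, IN).
    SearchCriterion(field="name", operator="CONTAINS", value="experiment"),
    SearchCriterion(field="tags", operator="IN", value=["2025-q1"]),
]

results = client.search_data_storage(   # assumed method name
    criteria=criteria,
    size=25,
    filter_logic=FilterLogic.AND,       # require all criteria; default is OR
)
for hit in results:
    print(hit)
```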
@@ -2118,11 +2318,11 @@ class DataStorageMethods:
         """Search data storage objects using vector similarity.

         Args:
-            embedding:
+            embedding: List of float values representing the embedding vector for similarity search
             size: Number of results to return (1-100)
             min_score: Minimum similarity score (0.0-1.0)
             dataset_id: Optional dataset ID filter
-            tags: Optional list of tags to filter by
+            tags: Optional list of string tags to filter by
             user_id: Optional user ID filter (admin only)
             project_id: Optional project ID filter

@@ -2196,11 +2396,11 @@ class DataStorageMethods:
         """Asynchronously search data storage objects using vector similarity.

         Args:
-            embedding:
+            embedding: List of float values representing the embedding vector for similarity search
             size: Number of results to return (1-100)
             min_score: Minimum similarity score (0.0-1.0)
             dataset_id: Optional dataset ID filter
-            tags: Optional list of tags to filter by
+            tags: Optional list of string tags to filter by
             user_id: Optional user ID filter (admin only)
             project_id: Optional project ID filter

@@ -2268,12 +2468,12 @@ class DataStorageMethods:
         """Fetch data from the storage system (sync version).

         Args:
-            data_storage_id:
+            data_storage_id: UUID of the data storage entry to fetch

         Returns:
             For PG_TABLE storage: string content
             For GCS storage: Path to downloaded file (may be unzipped if it was a zip)
-            For multi-location entries:
+            For multi-location entries: list of downloaded files
             None if not found or error occurred
         """
         if not data_storage_id:
@@ -2342,12 +2542,12 @@ class DataStorageMethods:
         """Fetch data from the storage system.

         Args:
-            data_storage_id:
+            data_storage_id: UUID of the data storage entry to fetch

         Returns:
             For PG_TABLE storage: string content
             For GCS storage: Path to downloaded file (may be unzipped if it was a zip)
-            For multi-location entries:
+            For multi-location entries: list of downloaded files
             None if not found or error occurred
         """
         if not data_storage_id:
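Fetching an entry returns different shapes depending on where it is stored, as the corrected docstring spells out. A sketch of handling those cases; `fetch_data` is an assumed method name, since the `def` line is not part of these hunks.

```python
from pathlib import Path
from uuid import UUID

from futurehouse_client import FutureHouseClient

client = FutureHouseClient(api_key="your-api-key")

result = client.fetch_data(                      # assumed method name
    UUID("00000000-0000-0000-0000-000000000000")
)

if result is None:
    print("entry not found or an error occurred")
elif isinstance(result, str):
    print("PG_TABLE content:", result[:200])     # string content
elif isinstance(result, Path):
    print("downloaded to", result)               # GCS download (maybe unzipped)
else:
    print("multi-location entry:", result)       # list of downloaded files
```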
@@ -2417,7 +2617,23 @@ class DataStorageMethods:
         name: str,
         description: str | None = None,
         dataset_id: UUID | None = None,
-    ):
+    ) -> CreateDatasetPayload:
+        """Asynchronously create a new dataset.
+
+        Args:
+            name: Name of the dataset to create
+            description: Optional description of the dataset
+            dataset_id: Optional UUID to assign to the dataset, or None to auto-generate
+
+        Returns:
+            CreateDatasetPayload: A Pydantic model containing:
+                - id - ID of the created dataset (None if auto-generated)
+                - name - Name of the dataset
+                - description - Description of the dataset
+
+        Raises:
+            DataStorageCreationError: If there's an error creating the dataset
+        """
         try:
             payload = CreateDatasetPayload(
                 name=name,
@@ -2448,7 +2664,23 @@ class DataStorageMethods:
         name: str,
         description: str | None = None,
         dataset_id: UUID | None = None,
-    ):
+    ) -> CreateDatasetPayload:
+        """Create a new dataset.
+
+        Args:
+            name: Name of the dataset to create
+            description: Optional description of the dataset
+            dataset_id: Optional UUID to assign to the dataset, or None to auto-generate
+
+        Returns:
+            CreateDatasetPayload: A Pydantic model containing:
+                - id - ID of the created dataset (None if auto-generated)
+                - name - Name of the dataset
+                - description - Description of the dataset
+
+        Raises:
+            DataStorageCreationError: If there's an error creating the dataset
+        """
         try:
             payload = CreateDatasetPayload(
                 name=name,
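Dataset creation now carries a docstring and a `CreateDatasetPayload` return annotation. A sketch, assuming the sync method is named `create_dataset` (the `def` line is not part of these hunks):

```python
from futurehouse_client import FutureHouseClient

client = FutureHouseClient(api_key="your-api-key")

dataset = client.create_dataset(           # assumed method name
    name="protein-folding-runs",
    description="Raw outputs from the folding benchmark",
)
# CreateDatasetPayload fields per the new docstring: id, name, description.
print(dataset.id, dataset.name, dataset.description)
```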
@@ -2522,14 +2754,48 @@ class DataStorageMethods:
         retry=retry_if_connection_error,
         before_sleep=before_sleep_log(logger, logging.WARNING),
     )
-    async def aget_dataset(self, dataset_id: UUID):
+    async def aget_dataset(self, dataset_id: UUID) -> GetDatasetAndEntriesResponse:
+        """Asynchronously retrieve a dataset by ID.
+
+        Args:
+            dataset_id: UUID of the dataset to retrieve
+
+        Returns:
+            GetDatasetAndEntriesResponse: A dict containing:
+                - dataset: DatasetStorage with fields:
+                    - id - Unique identifier for the dataset
+                    - name - Name of the dataset
+                    - user_id - ID of the user who created the dataset
+                    - description - Description of the dataset
+                    - created_at - Timestamp when the dataset was created
+                    - modified_at - Timestamp when the dataset was last modified
+                - data_storage_entries - List of data storage entries in the dataset, each containing:
+                    - id - Unique identifier for the data storage entry
+                    - name - Name of the data storage entry
+                    - description - Description of the data storage entry
+                    - content - Content of the data storage entry
+                    - embedding - Embedding vector for the content
+                    - is_collection - Whether this entry is a collection
+                    - tags - List of tags associated with the entry
+                    - parent_id - ID of the parent entry for hierarchical storage
+                    - project_id - ID of the project this entry belongs to
+                    - dataset_id - ID of the dataset this entry belongs to
+                    - path - Path in the storage system where this entry is located
+                    - bigquery_schema - Target BigQuery schema for the entry
+                    - user_id - ID of the user who created this entry
+                    - created_at - Timestamp when the entry was created
+                    - modified_at - Timestamp when the entry was last updated
+
+        Raises:
+            DataStorageError: If there's an error retrieving the dataset
+        """
         try:
             response = await self.async_client.get(
                 f"/v0.1/data-storage/datasets/{dataset_id}"
             )
             response.raise_for_status()

-            return response.json()
+            return GetDatasetAndEntriesResponse.model_validate(response.json())
         except HTTPStatusError as e:
             self._handle_http_errors(e, "retrieving")
         except Exception as e:
@@ -2541,12 +2807,46 @@ class DataStorageMethods:
         retry=retry_if_connection_error,
         before_sleep=before_sleep_log(logger, logging.WARNING),
     )
-    def get_dataset(self, dataset_id: UUID):
+    def get_dataset(self, dataset_id: UUID) -> GetDatasetAndEntriesResponse:
+        """Retrieve a dataset by ID.
+
+        Args:
+            dataset_id: UUID of the dataset to retrieve
+
+        Returns:
+            GetDatasetAndEntriesResponse: A dict containing:
+                - dataset: DatasetStorage with fields:
+                    - id - Unique identifier for the dataset
+                    - name - Name of the dataset
+                    - user_id - ID of the user who created the dataset
+                    - description - Description of the dataset
+                    - created_at - Timestamp when the dataset was created
+                    - modified_at - Timestamp when the dataset was last modified
+                - data_storage_entries - List of data storage entries in the dataset, each containing:
+                    - id - Unique identifier for the data storage entry
+                    - name - Name of the data storage entry
+                    - description - Description of the data storage entry
+                    - content - Content of the data storage entry
+                    - embedding - Embedding vector for the content
+                    - is_collection - Whether this entry is a collection
+                    - tags - List of tags associated with the entry
+                    - parent_id - ID of the parent entry for hierarchical storage
+                    - project_id - ID of the project this entry belongs to
+                    - dataset_id - ID of the dataset this entry belongs to
+                    - path - Path in the storage system where this entry is located
+                    - bigquery_schema - Target BigQuery schema for the entry
+                    - user_id - ID of the user who created this entry
+                    - created_at - Timestamp when the entry was created
+                    - modified_at - Timestamp when the entry was last updated
+
+        Raises:
+            DataStorageError: If there's an error retrieving the dataset
+        """
         try:
             response = self.client.get(f"/v0.1/data-storage/datasets/{dataset_id}")
             response.raise_for_status()

-            return response.json()
+            return GetDatasetAndEntriesResponse.model_validate(response.json())
         except HTTPStatusError as e:
             self._handle_http_errors(e, "retrieving")
         except Exception as e:
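`get_dataset` and `aget_dataset` now validate the response into `GetDatasetAndEntriesResponse` instead of returning a raw dict. A sketch; whether `data_storage_entries` hangs off the response or off `dataset` is read from the docstring above and should be treated as an assumption.

```python
from uuid import UUID

from futurehouse_client import FutureHouseClient

client = FutureHouseClient(api_key="your-api-key")

result = client.get_dataset(UUID("00000000-0000-0000-0000-000000000000"))

print(result.dataset.name, result.dataset.created_at)
# Assumed to sit at the top level of the response, alongside `dataset`.
for entry in result.data_storage_entries:
    print(entry.id, entry.name, entry.tags)
```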
@@ -2622,7 +2922,15 @@ class DataStorageMethods:
         retry=retry_if_connection_error,
         before_sleep=before_sleep_log(logger, logging.WARNING),
     )
-    async def adelete_data_storage_entry(self, data_storage_entry_id: UUID):
+    async def adelete_data_storage_entry(self, data_storage_entry_id: UUID) -> None:
+        """Asynchronously delete a data storage entry.
+
+        Args:
+            data_storage_entry_id: UUID of the data storage entry to delete
+
+        Raises:
+            DataStorageError: If there's an error deleting the data storage entry
+        """
         try:
             await self.async_client.delete(
                 f"/v0.1/data-storage/data-entries/{data_storage_entry_id}"
@@ -2638,7 +2946,15 @@ class DataStorageMethods:
         retry=retry_if_connection_error,
         before_sleep=before_sleep_log(logger, logging.WARNING),
     )
-    def delete_data_storage_entry(self, data_storage_entry_id: UUID):
+    def delete_data_storage_entry(self, data_storage_entry_id: UUID) -> None:
+        """Delete a data storage entry.
+
+        Args:
+            data_storage_entry_id: UUID of the data storage entry to delete
+
+        Raises:
+            DataStorageError: If there's an error deleting the data storage entry
+        """
         try:
             self.client.delete(
                 f"/v0.1/data-storage/data-entries/{data_storage_entry_id}"
|