unstructured-ingest 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of unstructured-ingest has been flagged as potentially problematic; see the registry listing for details.

Files changed (52)
  1. test/integration/connectors/{databricks_tests → databricks}/test_volumes_native.py +75 -19
  2. test/integration/connectors/sql/test_postgres.py +9 -5
  3. test/integration/connectors/sql/test_singlestore.py +9 -5
  4. test/integration/connectors/sql/test_snowflake.py +6 -2
  5. test/integration/connectors/sql/test_sqlite.py +9 -5
  6. test/integration/connectors/test_astradb.py +40 -0
  7. test/integration/connectors/test_kafka.py +2 -2
  8. test/integration/connectors/test_mongodb.py +4 -1
  9. test/integration/connectors/utils/validation/source.py +31 -11
  10. unstructured_ingest/__version__.py +1 -1
  11. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  12. unstructured_ingest/v2/interfaces/file_data.py +69 -15
  13. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  14. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  15. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  16. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  17. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  18. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  19. unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
  20. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  21. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  22. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  23. unstructured_ingest/v2/processes/connectors/astradb.py +37 -33
  24. unstructured_ingest/v2/processes/connectors/couchbase.py +52 -41
  25. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -0
  26. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +2 -2
  27. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +2 -2
  28. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +2 -2
  29. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +2 -2
  30. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +41 -45
  31. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  32. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  33. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  34. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  35. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  36. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  37. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  38. unstructured_ingest/v2/processes/connectors/mongodb.py +94 -100
  39. unstructured_ingest/v2/processes/connectors/neo4j.py +5 -3
  40. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  41. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  42. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  43. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  44. unstructured_ingest/v2/processes/connectors/sql/sql.py +36 -26
  45. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  46. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/METADATA +11 -10
  47. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/RECORD +52 -52
  48. /test/integration/connectors/{databricks_tests → databricks}/__init__.py +0 -0
  49. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/LICENSE.md +0 -0
  50. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/WHEEL +0 -0
  51. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/entry_points.txt +0 -0
  52. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/top_level.txt +0 -0
test/integration/connectors/databricks/test_volumes_native.py

@@ -1,10 +1,10 @@
 import json
 import os
-import tempfile
 import uuid
 from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
+from unittest import mock

 import pytest
 from databricks.sdk import WorkspaceClient
@@ -31,11 +31,15 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes_native impor


 @dataclass
-class EnvData:
+class BaseEnvData:
     host: str
+    catalog: str
+
+
+@dataclass
+class BasicAuthEnvData(BaseEnvData):
     client_id: str
     client_secret: str
-    catalog: str

     def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
         return DatabricksNativeVolumesConnectionConfig(
@@ -47,8 +51,21 @@ class EnvData:
         )


-def get_env_data() -> EnvData:
-    return EnvData(
+@dataclass
+class PATEnvData(BaseEnvData):
+    token: str
+
+    def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
+        return DatabricksNativeVolumesConnectionConfig(
+            host=self.host,
+            access_config=DatabricksNativeVolumesAccessConfig(
+                token=self.token,
+            ),
+        )
+
+
+def get_basic_auth_env_data() -> BasicAuthEnvData:
+    return BasicAuthEnvData(
         host=os.environ["DATABRICKS_HOST"],
         client_id=os.environ["DATABRICKS_CLIENT_ID"],
         client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
@@ -56,23 +73,30 @@ def get_env_data() -> EnvData:
     )


+def get_pat_env_data() -> PATEnvData:
+    return PATEnvData(
+        host=os.environ["DATABRICKS_HOST"],
+        catalog=os.environ["DATABRICKS_CATALOG"],
+        token=os.environ["DATABRICKS_PAT"],
+    )
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
 @requires_env(
     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
 )
-async def test_volumes_native_source():
-    env_data = get_env_data()
-    indexer_config = DatabricksNativeVolumesIndexerConfig(
-        recursive=True,
-        volume="test-platform",
-        volume_path="databricks-volumes-test-input",
-        catalog=env_data.catalog,
-    )
-    connection_config = env_data.get_connection_config()
-    with tempfile.TemporaryDirectory() as tempdir:
-        tempdir_path = Path(tempdir)
-        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tempdir_path)
+async def test_volumes_native_source(tmp_path: Path):
+    env_data = get_basic_auth_env_data()
+    with mock.patch.dict(os.environ, clear=True):
+        indexer_config = DatabricksNativeVolumesIndexerConfig(
+            recursive=True,
+            volume="test-platform",
+            volume_path="databricks-volumes-test-input",
+            catalog=env_data.catalog,
+        )
+        connection_config = env_data.get_connection_config()
+        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
         indexer = DatabricksNativeVolumesIndexer(
             connection_config=connection_config, index_config=indexer_config
         )
@@ -89,12 +113,44 @@ async def test_volumes_native_source():
     )


+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
+async def test_volumes_native_source_pat(tmp_path: Path):
+    env_data = get_pat_env_data()
+    with mock.patch.dict(os.environ, clear=True):
+        indexer_config = DatabricksNativeVolumesIndexerConfig(
+            recursive=True,
+            volume="test-platform",
+            volume_path="databricks-volumes-test-input",
+            catalog=env_data.catalog,
+        )
+        connection_config = env_data.get_connection_config()
+        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
+        indexer = DatabricksNativeVolumesIndexer(
+            connection_config=connection_config, index_config=indexer_config
+        )
+        downloader = DatabricksNativeVolumesDownloader(
+            connection_config=connection_config, download_config=download_config
+        )
+        await source_connector_validation(
+            indexer=indexer,
+            downloader=downloader,
+            configs=SourceValidationConfigs(
+                test_id="databricks_volumes_native_pat",
+                expected_num_files=1,
+            ),
+        )
+
+
 def _get_volume_path(catalog: str, volume: str, volume_path: str):
     return f"/Volumes/{catalog}/default/{volume}/{volume_path}"


 @contextmanager
-def databricks_destination_context(env_data: EnvData, volume: str, volume_path) -> WorkspaceClient:
+def databricks_destination_context(
+    env_data: BasicAuthEnvData, volume: str, volume_path
+) -> WorkspaceClient:
     client = WorkspaceClient(
         host=env_data.host, client_id=env_data.client_id, client_secret=env_data.client_secret
     )
@@ -137,7 +193,7 @@ def validate_upload(client: WorkspaceClient, catalog: str, volume: str, volume_p
     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
 )
 async def test_volumes_native_destination(upload_file: Path):
-    env_data = get_env_data()
+    env_data = get_basic_auth_env_data()
     volume_path = f"databricks-volumes-test-output-{uuid.uuid4()}"
     file_data = FileData(
         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
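
The updated Databricks source tests read credentials into typed env-data helpers (get_basic_auth_env_data / get_pat_env_data) and then clear the process environment, so the SDK can only authenticate with whatever the connection config passes explicitly. A minimal, standalone sketch of that isolation pattern (stdlib only; the host value here is illustrative):

import os
from unittest import mock

# Read what is needed up front, before the environment is cleared.
creds = {"host": os.environ.get("DATABRICKS_HOST", "https://example.cloud.databricks.com")}

with mock.patch.dict(os.environ, clear=True):
    # Inside this block os.environ is empty, so nothing can silently fall back
    # to ambient DATABRICKS_* variables; auth must come from `creds`.
    assert "DATABRICKS_HOST" not in os.environ
    print(f"connecting to {creds['host']} using explicitly passed credentials")
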
test/integration/connectors/sql/test_postgres.py

@@ -15,7 +15,7 @@ from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
 )
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.postgres import (
     CONNECTOR_TYPE,
     PostgresAccessConfig,
@@ -28,7 +28,7 @@ from unstructured_ingest.v2.processes.connectors.sql.postgres import (
     PostgresUploadStager,
 )

-SEED_DATA_ROWS = 20
+SEED_DATA_ROWS = 10


 @pytest.fixture
@@ -69,7 +69,7 @@ async def test_postgres_source(temp_dir: Path, source_database_setup: str):
     )
     indexer = PostgresIndexer(
         connection_config=connection_config,
-        index_config=PostgresIndexerConfig(table_name="cars", id_column="car_id", batch_size=5),
+        index_config=PostgresIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
     )
     downloader = PostgresDownloader(
         connection_config=connection_config,
@@ -81,7 +81,7 @@ async def test_postgres_source(temp_dir: Path, source_database_setup: str):
         configs=SourceValidationConfigs(
             test_id="postgres",
             expected_num_files=SEED_DATA_ROWS,
-            expected_number_indexed_file_data=4,
+            expected_number_indexed_file_data=2,
             validate_downloaded_files=True,
         ),
     )
@@ -119,7 +119,11 @@ def validate_destination(
 async def test_postgres_destination(upload_file: Path, temp_dir: Path):
     # the postgres destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
-    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     with docker_compose_context(
         docker_compose_path=env_setup_path / "sql" / "postgres" / "destination"
     ):
test/integration/connectors/sql/test_singlestore.py

@@ -15,7 +15,7 @@ from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
 )
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
     CONNECTOR_TYPE,
     SingleStoreAccessConfig,
@@ -29,7 +29,7 @@ from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
     SingleStoreUploadStager,
 )

-SEED_DATA_ROWS = 20
+SEED_DATA_ROWS = 10


 @pytest.fixture
@@ -66,7 +66,7 @@ async def test_singlestore_source(temp_dir: Path, source_database_setup: dict):
     )
     indexer = SingleStoreIndexer(
         connection_config=connection_config,
-        index_config=SingleStoreIndexerConfig(table_name="cars", id_column="car_id", batch_size=5),
+        index_config=SingleStoreIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
     )
     downloader = SingleStoreDownloader(
         connection_config=connection_config,
@@ -80,7 +80,7 @@ async def test_singlestore_source(temp_dir: Path, source_database_setup: dict):
         configs=SourceValidationConfigs(
             test_id="singlestore",
             expected_num_files=SEED_DATA_ROWS,
-            expected_number_indexed_file_data=4,
+            expected_number_indexed_file_data=2,
             validate_downloaded_files=True,
         ),
     )
@@ -103,7 +103,11 @@ def validate_destination(
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
 async def test_singlestore_destination(upload_file: Path, temp_dir: Path):
-    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     with docker_compose_context(
         docker_compose_path=env_setup_path / "sql" / "singlestore" / "destination"
     ):
test/integration/connectors/sql/test_snowflake.py

@@ -17,7 +17,7 @@ from test.integration.connectors.utils.validation.source import (
     source_connector_validation,
 )
 from test.integration.utils import requires_env
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.snowflake import (
     CONNECTOR_TYPE,
     SnowflakeAccessConfig,
@@ -170,7 +170,11 @@ async def test_snowflake_destination(
 ):
     # the postgres destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
-    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     init_db_destination()
     stager = SnowflakeUploadStager()
     staged_path = stager.run(
test/integration/connectors/sql/test_sqlite.py

@@ -15,7 +15,7 @@ from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
 )
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
     CONNECTOR_TYPE,
     SQLiteConnectionConfig,
@@ -27,7 +27,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
     SQLiteUploadStager,
 )

-SEED_DATA_ROWS = 20
+SEED_DATA_ROWS = 10


 @pytest.fixture
@@ -57,7 +57,7 @@ async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
     connection_config = SQLiteConnectionConfig(database_path=source_database_setup)
     indexer = SQLiteIndexer(
         connection_config=connection_config,
-        index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=5),
+        index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
     )
     downloader = SQLiteDownloader(
         connection_config=connection_config,
@@ -69,7 +69,7 @@ async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
         configs=SourceValidationConfigs(
             test_id="sqlite",
             expected_num_files=SEED_DATA_ROWS,
-            expected_number_indexed_file_data=4,
+            expected_number_indexed_file_data=2,
             validate_downloaded_files=True,
         ),
     )
@@ -116,7 +116,11 @@ async def test_sqlite_destination(
 ):
     # the sqlite destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
-    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     stager = SQLiteUploadStager()
     staged_path = stager.run(
         elements_filepath=upload_file,
test/integration/connectors/test_astradb.py

@@ -14,12 +14,18 @@ from test.integration.connectors.utils.validation.destination import (
     StagerValidationConfigs,
     stager_validation,
 )
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
+    source_connector_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.astradb import (
     CONNECTOR_TYPE,
     AstraDBAccessConfig,
     AstraDBConnectionConfig,
+    AstraDBDownloader,
+    AstraDBDownloaderConfig,
     AstraDBIndexer,
     AstraDBIndexerConfig,
     AstraDBUploader,
@@ -110,6 +116,40 @@ def collection(upload_file: Path) -> Collection:
     astra_db.drop_collection(collection)


+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
+async def test_astra_search_source(
+    tmp_path: Path,
+):
+    env_data = get_env_data()
+    collection_name = "ingest_test_src"
+    connection_config = AstraDBConnectionConfig(
+        access_config=AstraDBAccessConfig(token=env_data.token, api_endpoint=env_data.api_endpoint)
+    )
+    indexer = AstraDBIndexer(
+        index_config=AstraDBIndexerConfig(
+            collection_name=collection_name,
+        ),
+        connection_config=connection_config,
+    )
+    downloader = AstraDBDownloader(
+        connection_config=connection_config,
+        download_config=AstraDBDownloaderConfig(download_dir=tmp_path),
+    )
+
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id=CONNECTOR_TYPE,
+            expected_num_files=5,
+            expected_number_indexed_file_data=1,
+            validate_downloaded_files=True,
+        ),
+    )
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
 @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
test/integration/connectors/test_kafka.py

@@ -122,7 +122,7 @@ async def test_kafka_source_local(kafka_seed_topic: str):
         indexer=indexer,
         downloader=downloader,
         configs=SourceValidationConfigs(
-            test_id="kafka", expected_num_files=5, validate_downloaded_files=True
+            test_id="kafka-local", expected_num_files=5, validate_downloaded_files=True
         ),
     )

@@ -204,7 +204,7 @@ async def test_kafka_source_cloud(kafka_seed_topic_cloud: int):
         indexer=indexer,
         downloader=downloader,
         configs=SourceValidationConfigs(
-            test_id="kafka",
+            test_id="kafka-cloud",
             exclude_fields_extend=["connector_type"],
             expected_num_files=expected_messages,
             validate_downloaded_files=True,
test/integration/connectors/test_mongodb.py

@@ -197,7 +197,10 @@ async def test_mongodb_source(temp_dir: Path):
         indexer=indexer,
         downloader=downloader,
         configs=SourceValidationConfigs(
-            test_id=CONNECTOR_TYPE, expected_num_files=4, validate_downloaded_files=True
+            test_id=CONNECTOR_TYPE,
+            expected_num_files=4,
+            validate_downloaded_files=True,
+            expected_number_indexed_file_data=1,
         ),
     )

test/integration/connectors/utils/validation/source.py

@@ -1,14 +1,13 @@
 import json
 import os
 import shutil
-from dataclasses import replace
 from pathlib import Path
 from typing import Callable, Optional

 from deepdiff import DeepDiff
 from pydantic import Field

-from test.integration.connectors.utils.validation.utils import ValidationConfig, reset_dir
+from test.integration.connectors.utils.validation.utils import ValidationConfig
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer

@@ -92,7 +91,7 @@ def check_contents(
     file_data_path = expected_output_dir / f"{file_data.identifier}.json"
     with file_data_path.open("r") as file:
         expected_file_data_contents = json.load(file)
-    current_file_data_contents = file_data.to_dict()
+    current_file_data_contents = file_data.model_dump()
     expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
     current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
     diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
@@ -160,9 +159,11 @@ def update_fixtures(
     save_filedata: bool = True,
 ):
     # Rewrite the current file data
+    if not output_dir.exists():
+        output_dir.mkdir(parents=True)
     if save_filedata:
         file_data_output_path = output_dir / "file_data"
-        reset_dir(dir_path=file_data_output_path)
+        shutil.rmtree(path=file_data_output_path, ignore_errors=True)
         print(
             f"Writing {len(all_file_data)} file data to "
             f"saved fixture location {file_data_output_path}"
@@ -171,7 +172,7 @@ def update_fixtures(
         for file_data in all_file_data:
             file_data_path = file_data_output_path / f"{file_data.identifier}.json"
             with file_data_path.open(mode="w") as f:
-                json.dump(file_data.to_dict(), f, indent=2)
+                json.dump(file_data.model_dump(), f, indent=2)

     # Record file structure of download directory
     download_files = get_files(dir_path=download_dir)
@@ -183,7 +184,7 @@ def update_fixtures(
     # If applicable, save raw downloads
     if save_downloads:
         raw_download_output_path = output_dir / "downloads"
-        reset_dir(raw_download_output_path)
+        shutil.rmtree(path=raw_download_output_path, ignore_errors=True)
         print(
             f"Writing {len(download_files)} downloaded files to "
             f"saved fixture location {raw_download_output_path}"
@@ -213,7 +214,10 @@ def run_all_validations(
     if configs.validate_file_data:
         run_expected_results_validation(
             expected_output_dir=test_output_dir / "file_data",
-            all_file_data=postdownload_file_data,
+            all_file_data=get_all_file_data(
+                all_predownload_file_data=predownload_file_data,
+                all_postdownload_file_data=postdownload_file_data,
+            ),
             configs=configs,
         )
     download_files = get_files(dir_path=download_dir)
@@ -229,6 +233,19 @@ def run_all_validations(
     )


+def get_all_file_data(
+    all_postdownload_file_data: list[FileData], all_predownload_file_data: list[FileData]
+) -> list[FileData]:
+    all_file_data = all_postdownload_file_data
+    indexed_file_data = [
+        fd
+        for fd in all_predownload_file_data
+        if fd.identifier not in [f.identifier for f in all_file_data]
+    ]
+    all_file_data += indexed_file_data
+    return all_file_data
+
+
 async def source_connector_validation(
     indexer: Indexer,
     downloader: Downloader,
@@ -246,7 +263,7 @@ async def source_connector_validation(
     test_output_dir = configs.test_output_dir()
     for file_data in indexer.run():
         assert file_data
-        predownload_file_data = replace(file_data)
+        predownload_file_data = file_data.model_copy(deep=True)
         all_predownload_file_data.append(predownload_file_data)
         if downloader.is_async():
             resp = await downloader.run_async(file_data=file_data)
@@ -254,10 +271,10 @@ async def source_connector_validation(
             resp = downloader.run(file_data=file_data)
         if isinstance(resp, list):
             for r in resp:
-                postdownload_file_data = replace(r["file_data"])
+                postdownload_file_data = r["file_data"].model_copy(deep=True)
                 all_postdownload_file_data.append(postdownload_file_data)
         else:
-            postdownload_file_data = replace(resp["file_data"])
+            postdownload_file_data = resp["file_data"].model_copy(deep=True)
            all_postdownload_file_data.append(postdownload_file_data)
         if not overwrite_fixtures:
             print("Running validation")
@@ -273,7 +290,10 @@ async def source_connector_validation(
         update_fixtures(
             output_dir=test_output_dir,
             download_dir=download_dir,
-            all_file_data=all_postdownload_file_data,
+            all_file_data=get_all_file_data(
+                all_predownload_file_data=all_predownload_file_data,
+                all_postdownload_file_data=all_postdownload_file_data,
+            ),
             save_downloads=configs.validate_downloaded_files,
             save_filedata=configs.validate_file_data,
         )
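
Because FileData is now a pydantic model (see the interfaces diff below), the validation helpers snapshot and serialize it with model_copy(deep=True) and model_dump() instead of dataclasses.replace and to_dict. A small sketch of that usage, with illustrative field values:

from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers

file_data = FileData(
    identifier="example-id",
    connector_type="example",
    source_identifiers=SourceIdentifiers(filename="doc.pdf", fullpath="docs/doc.pdf"),
)

snapshot = file_data.model_copy(deep=True)  # replaces dataclasses.replace(file_data)
as_dict = file_data.model_dump()            # replaces file_data.to_dict()
assert snapshot == file_data
assert as_dict["source_identifiers"]["filename"] == "doc.pdf"
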
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.3.9" # pragma: no cover
+__version__ = "0.3.11" # pragma: no cover
unstructured_ingest/v2/interfaces/__init__.py

@@ -1,6 +1,6 @@
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
-from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
+from .file_data import BatchFileData, BatchItem, FileData, FileDataSourceMetadata, SourceIdentifiers
 from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
@@ -27,4 +27,6 @@ __all__ = [
     "ConnectionConfig",
     "BaseConnector",
     "FileDataSourceMetadata",
+    "BatchFileData",
+    "BatchItem",
 ]
unstructured_ingest/v2/interfaces/file_data.py

@@ -1,13 +1,14 @@
 import json
-from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Literal, Optional
+from typing import Any, Optional
+from uuid import NAMESPACE_DNS, uuid5

-from dataclasses_json import DataClassJsonMixin
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator

+from unstructured_ingest.v2.logger import logger

-@dataclass
-class SourceIdentifiers:
+
+class SourceIdentifiers(BaseModel):
     filename: str
     fullpath: str
     rel_path: Optional[str] = None
@@ -21,8 +22,7 @@ class SourceIdentifiers:
         return self.rel_path or self.fullpath


-@dataclass
-class FileDataSourceMetadata(DataClassJsonMixin):
+class FileDataSourceMetadata(BaseModel):
     url: Optional[str] = None
     version: Optional[str] = None
     record_locator: Optional[dict[str, Any]] = None
@@ -33,14 +33,12 @@ class FileDataSourceMetadata(DataClassJsonMixin):
     filesize_bytes: Optional[int] = None


-@dataclass
-class FileData(DataClassJsonMixin):
+class FileData(BaseModel):
     identifier: str
     connector_type: str
-    source_identifiers: Optional[SourceIdentifiers] = None
-    doc_type: Literal["file", "batch"] = field(default="file")
-    metadata: FileDataSourceMetadata = field(default_factory=lambda: FileDataSourceMetadata())
-    additional_metadata: dict[str, Any] = field(default_factory=dict)
+    source_identifiers: SourceIdentifiers
+    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
+    additional_metadata: dict[str, Any] = Field(default_factory=dict)
     reprocess: bool = False
     local_download_path: Optional[str] = None
     display_name: Optional[str] = None
@@ -52,11 +50,67 @@ class FileData(DataClassJsonMixin):
             raise ValueError(f"file path not valid: {path}")
         with open(str(path.resolve()), "rb") as f:
             file_data_dict = json.load(f)
-        file_data = FileData.from_dict(file_data_dict)
+        file_data = cls.model_validate(file_data_dict)
         return file_data

+    @classmethod
+    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
+        file_data_dict = file_data.model_dump()
+        return cls.model_validate(file_data_dict, **kwargs)
+
     def to_file(self, path: str) -> None:
         path = Path(path).resolve()
         path.parent.mkdir(parents=True, exist_ok=True)
         with open(str(path.resolve()), "w") as f:
-            json.dump(self.to_dict(), f, indent=2)
+            json.dump(self.model_dump(), f, indent=2)
+
+
+class BatchItem(BaseModel):
+    identifier: str
+    version: Optional[str] = None
+
+
+class BatchFileData(FileData):
+    identifier: str = Field(init=False)
+    batch_items: list[BatchItem]
+    source_identifiers: Optional[SourceIdentifiers] = None
+
+    @field_validator("batch_items")
+    @classmethod
+    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
+        if not v:
+            raise ValueError("batch items cannot be empty")
+        all_identifiers = [item.identifier for item in v]
+        if len(all_identifiers) != len(set(all_identifiers)):
+            raise ValueError(f"duplicate identifiers: {all_identifiers}")
+        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
+        return sorted_batch_items
+
+    @model_validator(mode="before")
+    @classmethod
+    def populate_identifier(cls, data: Any) -> Any:
+        if isinstance(data, dict) and "identifier" not in data:
+            batch_items = data["batch_items"]
+            identifier_data = json.dumps(
+                {item.identifier: item.version for item in batch_items}, sort_keys=True
+            )
+            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
+        return data
+
+
+def file_data_from_file(path: str) -> FileData:
+    try:
+        return BatchFileData.from_file(path=path)
+    except ValidationError:
+        logger.debug(f"{path} not valid for batch file data")
+
+    return FileData.from_file(path=path)
+
+
+def file_data_from_dict(data: dict) -> FileData:
+    try:
+        return BatchFileData.model_validate(data)
+    except ValidationError:
+        logger.debug(f"{data} not valid for batch file data")
+
+    return FileData.model_validate(data)
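
A short sketch of how the new batch-aware model behaves, based on the classes above: BatchFileData derives a deterministic uuid5 identifier from its sorted batch items (duplicates are rejected), and file_data_from_dict falls back to plain FileData when no batch_items are present. Field values here are illustrative:

from unstructured_ingest.v2.interfaces import BatchFileData, BatchItem
from unstructured_ingest.v2.interfaces.file_data import file_data_from_dict

batch = BatchFileData(
    connector_type="example",
    batch_items=[BatchItem(identifier="b"), BatchItem(identifier="a", version="2")],
)
print(batch.identifier)  # stable uuid5 derived from the item identifiers/versions
print([item.identifier for item in batch.batch_items])  # ['a', 'b'] after sorting

plain = file_data_from_dict(
    {
        "identifier": "single-file",
        "connector_type": "example",
        "source_identifiers": {"filename": "doc.pdf", "fullpath": "docs/doc.pdf"},
    }
)
print(type(plain).__name__)  # FileData
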