unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in that registry.

Potentially problematic release: this version of unstructured-ingest has been flagged as possibly problematic.

Files changed (187)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/main.py +0 -0
  7. unstructured_ingest/pipeline/interfaces.py +1 -1
  8. unstructured_ingest/pipeline/pipeline.py +1 -1
  9. unstructured_ingest/processes/chunker.py +4 -0
  10. unstructured_ingest/processes/connectors/airtable.py +4 -2
  11. unstructured_ingest/processes/connectors/astradb.py +2 -2
  12. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  13. unstructured_ingest/processes/connectors/confluence.py +0 -1
  14. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  15. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  16. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  18. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  19. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  20. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  21. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  22. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  23. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  24. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  25. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  26. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  27. unstructured_ingest/processes/connectors/outlook.py +1 -2
  28. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  29. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  30. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  31. unstructured_ingest/processes/connectors/slack.py +1 -2
  32. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  33. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  34. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  35. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  37. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  38. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  39. unstructured_ingest/processes/connectors/vectara.py +0 -2
  40. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  41. unstructured_ingest/processes/embedder.py +2 -2
  42. unstructured_ingest/processes/filter.py +1 -1
  43. unstructured_ingest/processes/partitioner.py +4 -0
  44. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  45. unstructured_ingest/unstructured_api.py +13 -8
  46. unstructured_ingest/utils/data_prep.py +8 -32
  47. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  48. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +50 -184
  49. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  50. examples/__init__.py +0 -0
  51. examples/airtable.py +0 -44
  52. examples/azure_cognitive_search.py +0 -55
  53. examples/chroma.py +0 -54
  54. examples/couchbase.py +0 -55
  55. examples/databricks_volumes_dest.py +0 -55
  56. examples/databricks_volumes_source.py +0 -53
  57. examples/delta_table.py +0 -45
  58. examples/discord_example.py +0 -36
  59. examples/elasticsearch.py +0 -49
  60. examples/google_drive.py +0 -45
  61. examples/kdbai.py +0 -54
  62. examples/local.py +0 -36
  63. examples/milvus.py +0 -44
  64. examples/mongodb.py +0 -53
  65. examples/opensearch.py +0 -50
  66. examples/pinecone.py +0 -57
  67. examples/s3.py +0 -38
  68. examples/salesforce.py +0 -44
  69. examples/sharepoint.py +0 -47
  70. examples/singlestore.py +0 -49
  71. examples/sql.py +0 -90
  72. examples/vectara.py +0 -54
  73. examples/weaviate.py +0 -44
  74. test/__init__.py +0 -0
  75. test/integration/__init__.py +0 -0
  76. test/integration/chunkers/__init__.py +0 -0
  77. test/integration/chunkers/test_chunkers.py +0 -31
  78. test/integration/connectors/__init__.py +0 -0
  79. test/integration/connectors/conftest.py +0 -38
  80. test/integration/connectors/databricks/__init__.py +0 -0
  81. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  82. test/integration/connectors/discord/__init__.py +0 -0
  83. test/integration/connectors/discord/test_discord.py +0 -90
  84. test/integration/connectors/duckdb/__init__.py +0 -0
  85. test/integration/connectors/duckdb/conftest.py +0 -14
  86. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  87. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  88. test/integration/connectors/elasticsearch/__init__.py +0 -0
  89. test/integration/connectors/elasticsearch/conftest.py +0 -34
  90. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  91. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  92. test/integration/connectors/sql/__init__.py +0 -0
  93. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  94. test/integration/connectors/sql/test_postgres.py +0 -201
  95. test/integration/connectors/sql/test_singlestore.py +0 -182
  96. test/integration/connectors/sql/test_snowflake.py +0 -244
  97. test/integration/connectors/sql/test_sqlite.py +0 -168
  98. test/integration/connectors/sql/test_vastdb.py +0 -34
  99. test/integration/connectors/test_astradb.py +0 -287
  100. test/integration/connectors/test_azure_ai_search.py +0 -254
  101. test/integration/connectors/test_chroma.py +0 -136
  102. test/integration/connectors/test_confluence.py +0 -111
  103. test/integration/connectors/test_delta_table.py +0 -183
  104. test/integration/connectors/test_dropbox.py +0 -151
  105. test/integration/connectors/test_github.py +0 -49
  106. test/integration/connectors/test_google_drive.py +0 -257
  107. test/integration/connectors/test_jira.py +0 -67
  108. test/integration/connectors/test_lancedb.py +0 -247
  109. test/integration/connectors/test_milvus.py +0 -208
  110. test/integration/connectors/test_mongodb.py +0 -335
  111. test/integration/connectors/test_neo4j.py +0 -244
  112. test/integration/connectors/test_notion.py +0 -152
  113. test/integration/connectors/test_onedrive.py +0 -163
  114. test/integration/connectors/test_pinecone.py +0 -387
  115. test/integration/connectors/test_qdrant.py +0 -216
  116. test/integration/connectors/test_redis.py +0 -143
  117. test/integration/connectors/test_s3.py +0 -184
  118. test/integration/connectors/test_sharepoint.py +0 -222
  119. test/integration/connectors/test_vectara.py +0 -282
  120. test/integration/connectors/test_zendesk.py +0 -120
  121. test/integration/connectors/utils/__init__.py +0 -0
  122. test/integration/connectors/utils/constants.py +0 -13
  123. test/integration/connectors/utils/docker.py +0 -151
  124. test/integration/connectors/utils/docker_compose.py +0 -59
  125. test/integration/connectors/utils/validation/__init__.py +0 -0
  126. test/integration/connectors/utils/validation/destination.py +0 -77
  127. test/integration/connectors/utils/validation/equality.py +0 -76
  128. test/integration/connectors/utils/validation/source.py +0 -331
  129. test/integration/connectors/utils/validation/utils.py +0 -36
  130. test/integration/connectors/weaviate/__init__.py +0 -0
  131. test/integration/connectors/weaviate/conftest.py +0 -15
  132. test/integration/connectors/weaviate/test_cloud.py +0 -39
  133. test/integration/connectors/weaviate/test_local.py +0 -152
  134. test/integration/embedders/__init__.py +0 -0
  135. test/integration/embedders/conftest.py +0 -13
  136. test/integration/embedders/test_azure_openai.py +0 -57
  137. test/integration/embedders/test_bedrock.py +0 -103
  138. test/integration/embedders/test_huggingface.py +0 -24
  139. test/integration/embedders/test_mixedbread.py +0 -71
  140. test/integration/embedders/test_octoai.py +0 -75
  141. test/integration/embedders/test_openai.py +0 -74
  142. test/integration/embedders/test_togetherai.py +0 -71
  143. test/integration/embedders/test_vertexai.py +0 -63
  144. test/integration/embedders/test_voyageai.py +0 -79
  145. test/integration/embedders/utils.py +0 -66
  146. test/integration/partitioners/__init__.py +0 -0
  147. test/integration/partitioners/test_partitioner.py +0 -76
  148. test/integration/utils.py +0 -15
  149. test/unit/__init__.py +0 -0
  150. test/unit/chunkers/__init__.py +0 -0
  151. test/unit/chunkers/test_chunkers.py +0 -49
  152. test/unit/connectors/__init__.py +0 -0
  153. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  154. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  155. test/unit/connectors/motherduck/__init__.py +0 -0
  156. test/unit/connectors/motherduck/test_base.py +0 -73
  157. test/unit/connectors/sql/__init__.py +0 -0
  158. test/unit/connectors/sql/test_sql.py +0 -152
  159. test/unit/connectors/test_confluence.py +0 -71
  160. test/unit/connectors/test_jira.py +0 -401
  161. test/unit/embed/__init__.py +0 -0
  162. test/unit/embed/test_mixedbreadai.py +0 -42
  163. test/unit/embed/test_octoai.py +0 -27
  164. test/unit/embed/test_openai.py +0 -28
  165. test/unit/embed/test_vertexai.py +0 -25
  166. test/unit/embed/test_voyageai.py +0 -24
  167. test/unit/embedders/__init__.py +0 -0
  168. test/unit/embedders/test_bedrock.py +0 -36
  169. test/unit/embedders/test_huggingface.py +0 -48
  170. test/unit/embedders/test_mixedbread.py +0 -37
  171. test/unit/embedders/test_octoai.py +0 -35
  172. test/unit/embedders/test_openai.py +0 -35
  173. test/unit/embedders/test_togetherai.py +0 -37
  174. test/unit/embedders/test_vertexai.py +0 -37
  175. test/unit/embedders/test_voyageai.py +0 -38
  176. test/unit/partitioners/__init__.py +0 -0
  177. test/unit/partitioners/test_partitioner.py +0 -63
  178. test/unit/test_error.py +0 -27
  179. test/unit/test_html.py +0 -112
  180. test/unit/test_interfaces.py +0 -26
  181. test/unit/test_utils.py +0 -220
  182. test/unit/utils/__init__.py +0 -0
  183. test/unit/utils/data_generator.py +0 -32
  184. unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
  185. unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
  186. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  187. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
test/integration/connectors/utils/validation/destination.py
@@ -1,77 +0,0 @@
- import os
- import shutil
- from pathlib import Path
-
- from test.integration.connectors.utils.validation.utils import ValidationConfig
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
- from unstructured_ingest.interfaces import UploadStager
- from unstructured_ingest.utils.data_prep import get_data
-
-
- class StagerValidationConfigs(ValidationConfig):
-     expected_count: int
-     expected_folder: str = "stager"
-
-     def stager_output_dir(self) -> Path:
-         dir = self.test_output_dir() / self.expected_folder
-         dir.mkdir(exist_ok=True, parents=True)
-         return dir
-
-     def stager_output_path(self, input_path: Path) -> Path:
-         return self.stager_output_dir() / input_path.name
-
-
- def run_all_stager_validations(
-     configs: StagerValidationConfigs, input_file: Path, staged_filepath: Path
- ):
-     # Validate matching extensions
-     assert input_file.suffix == staged_filepath.suffix
-
-     # Validate length
-     staged_data = get_data(path=staged_filepath)
-     assert len(staged_data) == configs.expected_count
-
-     # Validate file
-     expected_filepath = configs.stager_output_path(input_path=input_file)
-     assert expected_filepath.exists(), f"{expected_filepath} does not exist"
-     assert expected_filepath.is_file(), f"{expected_filepath} is not a file"
-     if configs.detect_diff(expected_filepath=expected_filepath, current_filepath=staged_filepath):
-         raise AssertionError(
-             f"Current file ({staged_filepath}) does not match expected file: {expected_filepath}"
-         )
-
-
- def update_stager_fixtures(stager_output_path: Path, staged_filepath: Path):
-     copied_filepath = stager_output_path / staged_filepath.name
-     shutil.copy(staged_filepath, copied_filepath)
-
-
- def stager_validation(
-     stager: UploadStager,
-     tmp_dir: Path,
-     input_file: Path,
-     configs: StagerValidationConfigs,
-     overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
- ) -> None:
-     # Run stager
-     file_data = FileData(
-         source_identifiers=SourceIdentifiers(fullpath=input_file.name, filename=input_file.name),
-         connector_type=configs.test_id,
-         identifier="mock file data",
-     )
-     staged_filepath = stager.run(
-         elements_filepath=input_file,
-         file_data=file_data,
-         output_dir=tmp_dir,
-         output_filename=input_file.name,
-     )
-     if not overwrite_fixtures:
-         print("Running validation")
-         run_all_stager_validations(
-             configs=configs, input_file=input_file, staged_filepath=staged_filepath
-         )
-     else:
-         print("Running fixtures update")
-         update_stager_fixtures(
-             stager_output_path=configs.stager_output_dir(), staged_filepath=staged_filepath
-         )
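For orientation, the removed module above was the shared harness that connector destination tests used to exercise their upload stagers against saved fixtures. Below is a minimal, hypothetical sketch of how a test might have called it; `MyConnectorStager` and the fixture path are placeholders for illustration, not names taken from the package:

# Hypothetical usage sketch of the removed helper above. MyConnectorStager and the
# fixture path are illustrative placeholders, not real identifiers from the package.
from pathlib import Path

def test_my_connector_stager(tmp_path: Path):
    configs = StagerValidationConfigs(test_id="my_connector", expected_count=22)
    stager_validation(
        stager=MyConnectorStager(),  # any UploadStager implementation (placeholder)
        tmp_dir=tmp_path,
        input_file=Path("example-docs/elements.ndjson"),  # illustrative fixture file
        configs=configs,
    )
    # Setting OVERWRITE_FIXTURES=true flips the helper into fixture-update mode
    # instead of validation.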
test/integration/connectors/utils/validation/equality.py
@@ -1,76 +0,0 @@
- import json
- from pathlib import Path
-
- from bs4 import BeautifulSoup
- from deepdiff import DeepDiff
-
- from unstructured_ingest.utils import ndjson
-
-
- def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-     with expected_filepath.open() as f:
-         expected_data = json.load(f)
-     with current_filepath.open() as f:
-         current_data = json.load(f)
-     diff = DeepDiff(expected_data, current_data)
-     if diff:
-         print("diff between expected and current json")
-         print(diff.to_json(indent=2))
-         return False
-     return True
-
-
- def ndjson_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-     with expected_filepath.open() as f:
-         expected_data = ndjson.load(f)
-     with current_filepath.open() as f:
-         current_data = ndjson.load(f)
-     if len(current_data) != len(expected_data):
-         print(
-             f"expected data length {len(expected_data)} "
-             f"didn't match current results: {len(current_data)}"
-         )
-     for i in range(len(expected_data)):
-         e = expected_data[i]
-         r = current_data[i]
-         if e != r:
-             print(f"{i}th element doesn't match:\nexpected {e}\ncurrent {r}")
-             return False
-     return True
-
-
- def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-     with expected_filepath.open() as expected_f:
-         expected_soup = BeautifulSoup(expected_f, "html.parser")
-     with current_filepath.open() as current_f:
-         current_soup = BeautifulSoup(current_f, "html.parser")
-     return expected_soup.text == current_soup.text
-
-
- def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-     with expected_filepath.open() as expected_f:
-         expected_text_lines = expected_f.readlines()
-     with current_filepath.open() as current_f:
-         current_text_lines = current_f.readlines()
-     if len(expected_text_lines) != len(current_text_lines):
-         print(
-             f"Lines in expected text file ({len(expected_text_lines)}) "
-             f"don't match current text file ({len(current_text_lines)})"
-         )
-         return False
-     expected_text = "\n".join(expected_text_lines)
-     current_text = "\n".join(current_text_lines)
-     if expected_text == current_text:
-         return True
-     print("txt content don't match:")
-     print(f"expected: {expected_text}")
-     print(f"current: {current_text}")
-     return False
-
-
- file_type_equality_check = {
-     ".json": json_equality_check,
-     ".ndjson": ndjson_equality_check,
-     ".html": html_equality_check,
-     ".txt": txt_equality_check,
- }
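The removed module above registers one comparison routine per file suffix in `file_type_equality_check`. As a brief illustrative sketch (the file paths are invented), a caller looks the checker up by suffix and falls back to a byte-for-byte comparison when the suffix is unknown, which is the same fallback order `ValidationConfig.detect_diff` (further below) applies:

# Illustrative lookup against the dispatch table above; the file paths are made up.
import filecmp
from pathlib import Path

expected = Path("fixtures/output.ndjson")
current = Path("test-output/output.ndjson")

checker = file_type_equality_check.get(expected.suffix)
if checker is not None:
    same = checker(expected_filepath=expected, current_filepath=current)
else:
    # Unknown suffix: compare raw bytes instead.
    same = filecmp.cmp(expected, current, shallow=False)
print("files match" if same else "files differ")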
test/integration/connectors/utils/validation/source.py
@@ -1,331 +0,0 @@
- import json
- import os
- import shutil
- from pathlib import Path
- from typing import Callable, Optional
-
- from deepdiff import DeepDiff
- from pydantic import Field
-
- from test.integration.connectors.utils.validation.utils import ValidationConfig
- from unstructured_ingest.data_types.file_data import FileData
- from unstructured_ingest.interfaces import Downloader, Indexer
-
- NONSTANDARD_METADATA_FIELDS = {
-     "additional_metadata.@microsoft.graph.downloadUrl": [
-         "additional_metadata",
-         "@microsoft.graph.downloadUrl",
-     ]
- }
-
-
- class SourceValidationConfigs(ValidationConfig):
-     expected_number_indexed_file_data: Optional[int] = None
-     expected_num_files: Optional[int] = None
-     predownload_file_data_check: Optional[Callable[[FileData], None]] = None
-     postdownload_file_data_check: Optional[Callable[[FileData], None]] = None
-     exclude_fields: list[str] = Field(
-         default_factory=lambda: ["local_download_path", "metadata.date_processed"]
-     )
-     exclude_fields_extend: list[str] = Field(default_factory=list)
-     validate_downloaded_files: bool = False
-     validate_file_data: bool = True
-
-     def get_exclude_fields(self) -> list[str]:
-         exclude_fields = self.exclude_fields
-         exclude_fields.extend(self.exclude_fields_extend)
-         return list(set(exclude_fields))
-
-     def run_file_data_validation(
-         self, predownload_file_data: FileData, postdownload_file_data: FileData
-     ):
-         if predownload_file_data_check := self.predownload_file_data_check:
-             predownload_file_data_check(predownload_file_data)
-         if postdownload_file_data_check := self.postdownload_file_data_check:
-             postdownload_file_data_check(postdownload_file_data)
-
-     def run_download_dir_validation(self, download_dir: Path):
-         if expected_num_files := self.expected_num_files:
-             downloaded_files = [p for p in download_dir.rglob("*") if p.is_file()]
-             assert len(downloaded_files) == expected_num_files
-
-     def omit_ignored_fields(self, data: dict) -> dict:
-         exclude_fields = self.get_exclude_fields()
-         # Ignore fields that dynamically change every time the tests run
-         copied_data = data.copy()
-
-         for exclude_field in exclude_fields:
-             exclude_field_vals = (
-                 NONSTANDARD_METADATA_FIELDS[exclude_field]
-                 if exclude_field in NONSTANDARD_METADATA_FIELDS
-                 else exclude_field.split(".")
-             )
-             if len(exclude_field_vals) == 1:
-                 current_val = copied_data
-                 drop_field = exclude_field_vals[0]
-                 copied_data.pop(exclude_field_vals[0], None)
-             else:
-                 current_val = copied_data
-                 for val in exclude_field_vals[:-1]:
-                     current_val = current_val.get(val, {})
-                 drop_field = exclude_field_vals[-1]
-             if drop_field == "*":
-                 current_val.clear()
-             else:
-                 current_val.pop(drop_field, None)
-         return copied_data
-
-
- def get_files(dir_path: Path) -> list[str]:
-     return [
-         str(f).replace(str(dir_path), "").lstrip("/") for f in dir_path.rglob("*") if f.is_file()
-     ]
-
-
- def check_files(expected_output_dir: Path, all_file_data: list[FileData]):
-     expected_files = get_files(dir_path=expected_output_dir)
-     current_files = [f"{file_data.identifier}.json" for file_data in all_file_data]
-     diff = set(expected_files) ^ set(current_files)
-     assert not diff, "diff in files that exist: {}".format(", ".join(diff))
-
-
- def check_files_in_paths(expected_output_dir: Path, current_output_dir: Path):
-     expected_files = get_files(dir_path=expected_output_dir)
-     current_files = get_files(dir_path=current_output_dir)
-     diff = set(expected_files) ^ set(current_files)
-     assert not diff, "diff in files that exist: {}".format(", ".join(diff))
-
-
- def check_contents(
-     expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
- ):
-     found_diff = False
-     for file_data in all_file_data:
-         file_data_path = expected_output_dir / f"{file_data.identifier}.json"
-         with file_data_path.open("r") as file:
-             expected_file_data_contents = json.load(file)
-         current_file_data_contents = json.loads(file_data.model_dump_json())
-         expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
-         current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
-         diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
-         if diff:
-             found_diff = True
-             print(diff.to_json(indent=2))
-     assert not found_diff, f"Diffs found between files: {found_diff}"
-
-
- def check_raw_file_contents(
-     expected_output_dir: Path,
-     current_output_dir: Path,
-     configs: SourceValidationConfigs,
- ):
-     current_files = get_files(dir_path=current_output_dir)
-     found_diff = False
-     files = []
-     for current_file in current_files:
-         current_file_path = current_output_dir / current_file
-         expected_file_path = expected_output_dir / current_file
-         if configs.detect_diff(expected_file_path, current_file_path):
-             found_diff = True
-             files.append(str(expected_file_path))
-             print(f"diffs between files {expected_file_path} and {current_file_path}")
-     assert not found_diff, "Diffs found between files: {}".format(", ".join(files))
-
-
- def run_expected_results_validation(
-     expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
- ):
-     check_files(expected_output_dir=expected_output_dir, all_file_data=all_file_data)
-     check_contents(
-         expected_output_dir=expected_output_dir, all_file_data=all_file_data, configs=configs
-     )
-
-
- def run_expected_download_files_validation(
-     expected_output_dir: Path,
-     current_download_dir: Path,
-     configs: SourceValidationConfigs,
- ):
-     check_files_in_paths(
-         expected_output_dir=expected_output_dir, current_output_dir=current_download_dir
-     )
-     check_raw_file_contents(
-         expected_output_dir=expected_output_dir,
-         current_output_dir=current_download_dir,
-         configs=configs,
-     )
-
-
- def run_directory_structure_validation(expected_output_dir: Path, download_files: list[str]):
-     directory_record = expected_output_dir / "directory_structure.json"
-     with directory_record.open("r") as directory_file:
-         directory_file_contents = json.load(directory_file)
-     directory_structure = directory_file_contents["directory_structure"]
-     assert directory_structure == download_files
-
-
- def update_fixtures(
-     output_dir: Path,
-     download_dir: Path,
-     all_file_data: list[FileData],
-     save_downloads: bool = False,
-     save_filedata: bool = True,
- ):
-     # Rewrite the current file data
-     if not output_dir.exists():
-         output_dir.mkdir(parents=True)
-     if save_filedata:
-         file_data_output_path = output_dir / "file_data"
-         shutil.rmtree(path=file_data_output_path, ignore_errors=True)
-         print(
-             f"Writing {len(all_file_data)} file data to "
-             f"saved fixture location {file_data_output_path}"
-         )
-         file_data_output_path.mkdir(parents=True, exist_ok=True)
-         for file_data in all_file_data:
-             file_data_path = file_data_output_path / f"{file_data.identifier}.json"
-             with file_data_path.open(mode="w") as f:
-                 f.write(file_data.model_dump_json(indent=2))
-
-     # Record file structure of download directory
-     download_files = get_files(dir_path=download_dir)
-     download_files.sort()
-     download_dir_record = output_dir / "directory_structure.json"
-     with download_dir_record.open(mode="w") as f:
-         json.dump({"directory_structure": download_files}, f, indent=2)
-
-     # If applicable, save raw downloads
-     if save_downloads:
-         raw_download_output_path = output_dir / "downloads"
-         shutil.rmtree(path=raw_download_output_path, ignore_errors=True)
-         print(
-             f"Writing {len(download_files)} downloaded files to "
-             f"saved fixture location {raw_download_output_path}"
-         )
-         shutil.copytree(download_dir, raw_download_output_path)
-
-
- def run_all_validations(
-     configs: SourceValidationConfigs,
-     predownload_file_data: list[FileData],
-     postdownload_file_data: list[FileData],
-     download_dir: Path,
-     test_output_dir: Path,
- ):
-     if expected_number_indexed_file_data := configs.expected_number_indexed_file_data:
-         assert (
-             len(predownload_file_data) == expected_number_indexed_file_data
-         ), f"expected {expected_number_indexed_file_data} but got {len(predownload_file_data)}"
-     if expected_num_files := configs.expected_num_files:
-         assert (
-             len(postdownload_file_data) == expected_num_files
-         ), f"expected {expected_num_files} but got {len(postdownload_file_data)}"
-
-     for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
-         configs.run_file_data_validation(
-             predownload_file_data=pre_data, postdownload_file_data=post_data
-         )
-     configs.run_download_dir_validation(download_dir=download_dir)
-     if configs.validate_file_data:
-         run_expected_results_validation(
-             expected_output_dir=test_output_dir / "file_data",
-             all_file_data=get_all_file_data(
-                 all_predownload_file_data=predownload_file_data,
-                 all_postdownload_file_data=postdownload_file_data,
-             ),
-             configs=configs,
-         )
-     download_files = get_files(dir_path=download_dir)
-     download_files.sort()
-     run_directory_structure_validation(
-         expected_output_dir=configs.test_output_dir(), download_files=download_files
-     )
-     if configs.validate_downloaded_files:
-         run_expected_download_files_validation(
-             expected_output_dir=test_output_dir / "downloads",
-             current_download_dir=download_dir,
-             configs=configs,
-         )
-
-
- def get_all_file_data(
-     all_postdownload_file_data: list[FileData], all_predownload_file_data: list[FileData]
- ) -> list[FileData]:
-     all_file_data = all_postdownload_file_data
-     indexed_file_data = [
-         fd
-         for fd in all_predownload_file_data
-         if fd.identifier not in [f.identifier for f in all_file_data]
-     ]
-     all_file_data += indexed_file_data
-     return all_file_data
-
-
- async def source_connector_validation(
-     indexer: Indexer,
-     downloader: Downloader,
-     configs: SourceValidationConfigs,
-     overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
- ) -> None:
-     # Run common validations on the process of running a source connector, supporting dynamic
-     # validators that get passed in along with comparisons on the saved expected values.
-     # If overwrite_fixtures is set to True, will ignore all validators but instead overwrite the
-     # expected values with what gets generated by this test.
-     all_predownload_file_data = []
-     all_postdownload_file_data = []
-     indexer.precheck()
-     download_dir = downloader.download_config.download_dir
-     test_output_dir = configs.test_output_dir()
-     if indexer.is_async():
-         async for file_data in indexer.run_async():
-             assert file_data
-             predownload_file_data = file_data.model_copy(deep=True)
-             all_predownload_file_data.append(predownload_file_data)
-             if downloader.is_async():
-                 resp = await downloader.run_async(file_data=file_data)
-             else:
-                 resp = downloader.run(file_data=file_data)
-             if isinstance(resp, list):
-                 for r in resp:
-                     postdownload_file_data = r["file_data"].model_copy(deep=True)
-                     all_postdownload_file_data.append(postdownload_file_data)
-             else:
-                 postdownload_file_data = resp["file_data"].model_copy(deep=True)
-                 all_postdownload_file_data.append(postdownload_file_data)
-     else:
-         for file_data in indexer.run():
-             assert file_data
-             predownload_file_data = file_data.model_copy(deep=True)
-             all_predownload_file_data.append(predownload_file_data)
-             if downloader.is_async():
-                 resp = await downloader.run_async(file_data=file_data)
-             else:
-                 resp = downloader.run(file_data=file_data)
-             if isinstance(resp, list):
-                 for r in resp:
-                     postdownload_file_data = r["file_data"].model_copy(deep=True)
-                     all_postdownload_file_data.append(postdownload_file_data)
-             else:
-                 postdownload_file_data = resp["file_data"].model_copy(deep=True)
-                 all_postdownload_file_data.append(postdownload_file_data)
-     if not overwrite_fixtures:
-         print("Running validation")
-         run_all_validations(
-             configs=configs,
-             predownload_file_data=all_predownload_file_data,
-             postdownload_file_data=all_postdownload_file_data,
-             download_dir=download_dir,
-             test_output_dir=test_output_dir,
-         )
-     else:
-         print("Running fixtures update")
-         update_fixtures(
-             output_dir=test_output_dir,
-             download_dir=download_dir,
-             all_file_data=get_all_file_data(
-                 all_predownload_file_data=all_predownload_file_data,
-                 all_postdownload_file_data=all_postdownload_file_data,
-             ),
-             save_downloads=configs.validate_downloaded_files,
-             save_filedata=configs.validate_file_data,
-         )
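For context, `source_connector_validation` above was the single entry point that source-connector integration tests awaited after building a connector-specific indexer and downloader. A hedged sketch of that call pattern follows; `SomeIndexer`, `SomeDownloader`, and the async test decorator (pytest-asyncio here) are assumptions for illustration, not details taken from this diff:

# Hypothetical integration-test sketch. SomeIndexer/SomeDownloader stand in for a
# real connector's Indexer and Downloader implementations; pytest-asyncio is assumed.
import pytest

@pytest.mark.asyncio
async def test_some_connector_source(tmp_path):
    indexer = SomeIndexer(...)        # connector-specific construction (placeholder)
    downloader = SomeDownloader(...)  # must expose download_config.download_dir
    await source_connector_validation(
        indexer=indexer,
        downloader=downloader,
        configs=SourceValidationConfigs(
            test_id="some_connector",
            expected_num_files=3,
            validate_downloaded_files=True,
        ),
    )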
test/integration/connectors/utils/validation/utils.py
@@ -1,36 +0,0 @@
- import filecmp
- import shutil
- from pathlib import Path
- from typing import Callable, Optional
-
- from pydantic import BaseModel
-
- from test.integration.connectors.utils.constants import expected_results_path
- from test.integration.connectors.utils.validation.equality import file_type_equality_check
-
-
- class ValidationConfig(BaseModel):
-     test_id: str
-     file_equality_check: Optional[Callable[[Path, Path], bool]] = None
-
-     def test_output_dir(self) -> Path:
-         return expected_results_path / self.test_id
-
-     def detect_diff(self, expected_filepath: Path, current_filepath: Path) -> bool:
-         if expected_filepath.suffix != current_filepath.suffix:
-             return True
-         if file_equality_check := self.file_equality_check:
-             return not file_equality_check(expected_filepath, current_filepath)
-         current_suffix = expected_filepath.suffix
-         if current_suffix in file_type_equality_check:
-             equality_check_callable = file_type_equality_check[current_suffix]
-             return not equality_check_callable(
-                 expected_filepath=expected_filepath, current_filepath=current_filepath
-             )
-         # Fallback is using filecmp.cmp to compare the files
-         return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
-
-
- def reset_dir(dir_path: Path) -> None:
-     shutil.rmtree(path=dir_path, ignore_errors=True)
-     dir_path.mkdir(parents=True)
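`ValidationConfig.detect_diff` above checks, in order, the suffixes, a caller-supplied `file_equality_check`, the per-suffix table, and finally `filecmp`. A short hedged example of the override hook; the size-only comparison is an arbitrary illustrative policy, not one used by the package:

# Illustrative override of the equality hook; comparing sizes only is made up.
from pathlib import Path

def same_size(expected_filepath: Path, current_filepath: Path) -> bool:
    return expected_filepath.stat().st_size == current_filepath.stat().st_size

config = ValidationConfig(test_id="demo", file_equality_check=same_size)
has_diff = config.detect_diff(
    expected_filepath=Path("fixtures/sample.bin"),  # illustrative paths
    current_filepath=Path("tmp/sample.bin"),
)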
test/integration/connectors/weaviate/__init__.py
File without changes
test/integration/connectors/weaviate/conftest.py
@@ -1,15 +0,0 @@
- import json
- from pathlib import Path
-
- import pytest
-
-
- @pytest.fixture
- def collections_schema_config() -> dict:
-     int_test_dir = Path(__file__).parent
-     assets_dir = int_test_dir / "assets"
-     config_file = assets_dir / "elements.json"
-     assert config_file.exists()
-     assert config_file.is_file()
-     with config_file.open() as config_data:
-         return json.load(config_data)
test/integration/connectors/weaviate/test_cloud.py
@@ -1,39 +0,0 @@
- import pytest
- from pydantic import ValidationError
-
- from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
- from unstructured_ingest.processes.connectors.weaviate.cloud import (
-     CONNECTOR_TYPE,
-     CloudWeaviateAccessConfig,
-     CloudWeaviateConnectionConfig,
- )
-
-
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
- def test_weaviate_failing_connection_config():
-     with pytest.raises(ValidationError):
-         CloudWeaviateConnectionConfig(
-             access_config=CloudWeaviateAccessConfig(api_key="my key", password="password"),
-             username="username",
-             cluster_url="clusterurl",
-         )
-
-
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
- def test_weaviate_connection_config_happy_path():
-     CloudWeaviateConnectionConfig(
-         access_config=CloudWeaviateAccessConfig(
-             api_key="my key",
-         ),
-         cluster_url="clusterurl",
-     )
-
-
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
- def test_weaviate_connection_config_anonymous():
-     CloudWeaviateConnectionConfig(
-         access_config=CloudWeaviateAccessConfig(api_key="my key", password="password"),
-         username="username",
-         anonymous=True,
-         cluster_url="clusterurl",
-     )