unstructured-ingest 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (44)
  1. test/integration/connectors/sql/test_singlestore.py +156 -0
  2. test/integration/connectors/test_confluence.py +113 -0
  3. test/integration/connectors/test_kafka.py +67 -0
  4. test/integration/connectors/test_onedrive.py +112 -0
  5. test/integration/connectors/test_qdrant.py +137 -0
  6. test/integration/connectors/test_s3.py +1 -1
  7. test/integration/connectors/utils/docker.py +2 -1
  8. test/integration/connectors/utils/docker_compose.py +23 -8
  9. test/integration/connectors/utils/validation.py +73 -22
  10. unstructured_ingest/__version__.py +1 -1
  11. unstructured_ingest/connector/kafka.py +0 -1
  12. unstructured_ingest/interfaces.py +7 -7
  13. unstructured_ingest/v2/interfaces/file_data.py +1 -0
  14. unstructured_ingest/v2/processes/chunker.py +2 -2
  15. unstructured_ingest/v2/processes/connectors/__init__.py +15 -7
  16. unstructured_ingest/v2/processes/connectors/astradb.py +278 -55
  17. unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
  18. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -5
  19. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +2 -10
  20. unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
  21. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +13 -0
  22. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +82 -0
  23. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +196 -0
  24. unstructured_ingest/v2/processes/connectors/kafka/local.py +75 -0
  25. unstructured_ingest/v2/processes/connectors/onedrive.py +163 -2
  26. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  27. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  28. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  29. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
  30. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  31. unstructured_ingest/v2/processes/connectors/sql/__init__.py +5 -0
  32. unstructured_ingest/v2/processes/connectors/sql/postgres.py +1 -20
  33. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +168 -0
  34. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  35. unstructured_ingest/v2/processes/connectors/sql/sql.py +15 -6
  36. unstructured_ingest/v2/processes/partitioner.py +14 -3
  37. unstructured_ingest/v2/unstructured_api.py +25 -11
  38. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/METADATA +17 -17
  39. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/RECORD +43 -27
  40. unstructured_ingest/v2/processes/connectors/singlestore.py +0 -156
  41. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/LICENSE.md +0 -0
  42. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/WHEEL +0 -0
  43. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/entry_points.txt +0 -0
  44. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/top_level.txt +0 -0
test/integration/connectors/utils/validation.py

@@ -7,13 +7,14 @@ from pathlib import Path
 from typing import Callable, Optional
 
 import pandas as pd
+from bs4 import BeautifulSoup
 from deepdiff import DeepDiff
 
 from test.integration.connectors.utils.constants import expected_results_path
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
 
 
-def pandas_df_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
     expected_df = pd.read_csv(expected_filepath)
     current_df = pd.read_csv(current_filepath)
     if expected_df.equals(current_df):
@@ -27,6 +28,42 @@ def pandas_df_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
     return False
 
 
+def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as expected_f:
+        expected_soup = BeautifulSoup(expected_f, "html.parser")
+    with current_filepath.open() as current_f:
+        current_soup = BeautifulSoup(current_f, "html.parser")
+    return expected_soup.text == current_soup.text
+
+
+def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as expected_f:
+        expected_text_lines = expected_f.readlines()
+    with current_filepath.open() as current_f:
+        current_text_lines = current_f.readlines()
+    if len(expected_text_lines) != len(current_text_lines):
+        print(
+            f"Lines in expected text file ({len(expected_text_lines)}) "
+            f"don't match current text file ({len(current_text_lines)})"
+        )
+        return False
+    expected_text = "\n".join(expected_text_lines)
+    current_text = "\n".join(current_text_lines)
+    if expected_text == current_text:
+        return True
+    print("txt content don't match:")
+    print(f"expected: {expected_text}")
+    print(f"current: {current_text}")
+    return False
+
+
+file_type_equality_check = {
+    ".json": json_equality_check,
+    ".html": html_equality_check,
+    ".txt": txt_equality_check,
+}
+
+
 @dataclass
 class ValidationConfigs:
     test_id: str
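
For context: json_equality_check (the renamed pandas_df_equality_check) still loads fixtures with pd.read_csv, while the two new checkers compare rendered text. html_equality_check in particular compares BeautifulSoup's extracted text rather than raw markup, so files that differ only in tags or attributes compare equal. A minimal standalone sketch of that semantics (the markup strings are hypothetical):

from bs4 import BeautifulSoup

# Tag names and attributes vanish under .text; only the extracted text is compared
a = BeautifulSoup('<p class="x">Hello <b>world</b></p>', "html.parser")
b = BeautifulSoup('<p>Hello <i>world</i></p>', "html.parser")
print(a.text == b.text)  # True: both extract "Hello world"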
@@ -39,6 +76,7 @@ class ValidationConfigs:
     )
     exclude_fields_extend: list[str] = field(default_factory=list)
     validate_downloaded_files: bool = False
+    validate_file_data: bool = True
     downloaded_file_equality_check: Optional[Callable[[Path, Path], bool]] = None
 
     def get_exclude_fields(self) -> list[str]:
@@ -86,7 +124,7 @@ class ValidationConfigs:
 
 def get_files(dir_path: Path) -> list[str]:
     return [
-        str(f).replace(str(dir_path), "").lstrip("/") for f in dir_path.iterdir() if f.is_file()
+        str(f).replace(str(dir_path), "").lstrip("/") for f in dir_path.rglob("*") if f.is_file()
     ]
 
 
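The get_files change matters for nested fixtures: Path.iterdir() yields only a directory's immediate children, while Path.rglob("*") walks the whole tree, so downloads saved in subdirectories are now included in the structure validation. A quick self-contained demonstration:

import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    (root / "a.txt").touch()
    (root / "sub").mkdir()
    (root / "sub" / "b.txt").touch()
    print(sorted(f.name for f in root.iterdir() if f.is_file()))   # ['a.txt']
    print(sorted(f.name for f in root.rglob("*") if f.is_file()))  # ['a.txt', 'b.txt']
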
@@ -122,6 +160,23 @@ def check_contents(
     assert not found_diff, f"Diffs found between files: {found_diff}"
 
 
+def detect_diff(
+    configs: ValidationConfigs, expected_filepath: Path, current_filepath: Path
+) -> bool:
+    if expected_filepath.suffix != current_filepath.suffix:
+        return True
+    if downloaded_file_equality_check := configs.downloaded_file_equality_check:
+        return not downloaded_file_equality_check(expected_filepath, current_filepath)
+    current_suffix = expected_filepath.suffix
+    if current_suffix in file_type_equality_check:
+        equality_check_callable = file_type_equality_check[current_suffix]
+        return not equality_check_callable(
+            expected_filepath=expected_filepath, current_filepath=current_filepath
+        )
+    # Fallback is using filecmp.cmp to compare the files
+    return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
+
+
 def check_raw_file_contents(
     expected_output_dir: Path,
     current_output_dir: Path,
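
detect_diff centralizes the comparison precedence: mismatched suffixes are an immediate diff, an explicit downloaded_file_equality_check on the configs wins next, then the suffix-keyed file_type_equality_check table, and filecmp.cmp(..., shallow=False) remains the byte-for-byte fallback. The same dispatch shape, reduced to a standalone sketch (the stub checker is illustrative):

import filecmp
from pathlib import Path
from typing import Callable

checks: dict[str, Callable[[Path, Path], bool]] = {
    ".txt": lambda a, b: a.read_text() == b.read_text(),
}

def files_match(a: Path, b: Path) -> bool:
    if a.suffix != b.suffix:
        return False
    checker = checks.get(a.suffix)
    if checker is not None:
        return checker(a, b)
    return filecmp.cmp(a, b, shallow=False)  # byte-for-byte fallback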
@@ -133,15 +188,7 @@ def check_raw_file_contents(
     for current_file in current_files:
         current_file_path = current_output_dir / current_file
         expected_file_path = expected_output_dir / current_file
-        if downloaded_file_equality_check := configs.downloaded_file_equality_check:
-            is_different = downloaded_file_equality_check(expected_file_path, current_file_path)
-        elif expected_file_path.suffix == ".csv" and current_file_path.suffix == ".csv":
-            is_different = not pandas_df_equality_check(
-                expected_filepath=expected_file_path, current_filepath=current_file_path
-            )
-        else:
-            is_different = not filecmp.cmp(expected_file_path, current_file_path, shallow=False)
-        if is_different:
+        if detect_diff(configs, expected_file_path, current_file_path):
             found_diff = True
             files.append(str(expected_file_path))
             print(f"diffs between files {expected_file_path} and {current_file_path}")
@@ -185,17 +232,19 @@
     download_dir: Path,
     all_file_data: list[FileData],
     save_downloads: bool = False,
+    save_filedata: bool = True,
 ):
     # Delete current files
     shutil.rmtree(path=output_dir, ignore_errors=True)
     output_dir.mkdir(parents=True)
     # Rewrite the current file data
-    file_data_output_path = output_dir / "file_data"
-    file_data_output_path.mkdir(parents=True, exist_ok=True)
-    for file_data in all_file_data:
-        file_data_path = file_data_output_path / f"{file_data.identifier}.json"
-        with file_data_path.open(mode="w") as f:
-            json.dump(file_data.to_dict(), f, indent=2)
+    if save_filedata:
+        file_data_output_path = output_dir / "file_data"
+        file_data_output_path.mkdir(parents=True, exist_ok=True)
+        for file_data in all_file_data:
+            file_data_path = file_data_output_path / f"{file_data.identifier}.json"
+            with file_data_path.open(mode="w") as f:
+                json.dump(file_data.to_dict(), f, indent=2)
 
     # Record file structure of download directory
     download_files = get_files(dir_path=download_dir)
@@ -229,11 +278,12 @@ def run_all_validations(
         predownload_file_data=pre_data, postdownload_file_data=post_data
     )
     configs.run_download_dir_validation(download_dir=download_dir)
-    run_expected_results_validation(
-        expected_output_dir=test_output_dir / "file_data",
-        all_file_data=postdownload_file_data,
-        configs=configs,
-    )
+    if configs.validate_file_data:
+        run_expected_results_validation(
+            expected_output_dir=test_output_dir / "file_data",
+            all_file_data=postdownload_file_data,
+            configs=configs,
+        )
     download_files = get_files(dir_path=download_dir)
     download_files.sort()
     run_directory_structure_validation(
@@ -291,4 +341,5 @@ async def source_connector_validation(
         download_dir=download_dir,
         all_file_data=all_postdownload_file_data,
         save_downloads=configs.validate_downloaded_files,
+        save_filedata=configs.validate_file_data,
     )
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.2.0"  # pragma: no cover
+__version__ = "0.2.2"  # pragma: no cover
unstructured_ingest/connector/kafka.py

@@ -181,7 +181,6 @@ class KafkaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
                 logger.debug(f"found {len(collected)} messages, stopping")
                 consumer.commit(asynchronous=False)
                 break
-
         return [
             KafkaIngestDoc(
                 connector_config=self.connector_config,
unstructured_ingest/interfaces.py

@@ -21,6 +21,7 @@ from unstructured_ingest.enhanced_dataclass.core import _asdict
 from unstructured_ingest.error import PartitionError, SourceConnectionError
 from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.v2.unstructured_api import call_api
 
 if TYPE_CHECKING:
     from unstructured.documents.elements import Element
@@ -565,6 +566,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
     ) -> list["Element"]:
         from unstructured.documents.elements import DataSourceMetadata
         from unstructured.partition.auto import partition
+        from unstructured.staging.base import elements_from_dicts
 
         if not partition_config.partition_by_api:
             logger.debug("Using local partition")
@@ -582,18 +584,16 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
                 **partition_kwargs,
             )
         else:
-            from unstructured.partition.api import partition_via_api
-
             endpoint = partition_config.partition_endpoint
 
             logger.debug(f"using remote partition ({endpoint})")
-
-            elements = partition_via_api(
-                filename=str(self.filename),
+            elements_dicts = call_api(
+                server_url=endpoint,
                 api_key=partition_config.api_key,
-                api_url=endpoint,
-                **partition_kwargs,
+                filename=Path(self.filename),
+                api_parameters=partition_kwargs,
             )
+            elements = elements_from_dicts(elements_dicts)
         # TODO: add m_data_source_metadata to unstructured-api pipeline_api and then
         # pass the stringified json here
         return elements
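
Remote partitioning no longer goes through unstructured's partition_via_api; it now uses the ingest package's own call_api wrapper (imported above), which returns plain element dicts, and elements_from_dicts rehydrates those into Element objects. A sketch of the rehydration step, assuming the standard element-dict shape (the payload shown is illustrative):

from unstructured.staging.base import elements_from_dicts

element_dicts = [
    {"type": "NarrativeText", "element_id": "e1", "text": "Hello world", "metadata": {}},
]
elements = elements_from_dicts(element_dicts)
print(elements[0].text)  # "Hello world"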
unstructured_ingest/v2/interfaces/file_data.py

@@ -43,6 +43,7 @@ class FileData(DataClassJsonMixin):
     additional_metadata: dict[str, Any] = field(default_factory=dict)
     reprocess: bool = False
     local_download_path: Optional[str] = None
+    display_name: Optional[str] = None
 
     @classmethod
     def from_file(cls, path: str) -> "FileData":
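
Because display_name defaults to None, FileData payloads serialized before this release (which lack the key) still deserialize cleanly. A self-contained sketch of that backward-compatibility property, using a hypothetical stand-in for FileData:

from dataclasses import dataclass
from typing import Optional

from dataclasses_json import DataClassJsonMixin

@dataclass
class Demo(DataClassJsonMixin):  # hypothetical stand-in for FileData
    identifier: str
    display_name: Optional[str] = None

# A dict written before the field existed still loads; the new field defaults to None
print(Demo.from_dict({"identifier": "abc"}).display_name)  # None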
unstructured_ingest/v2/processes/chunker.py

@@ -9,7 +9,7 @@ from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.unstructured_api import call_api
+from unstructured_ingest.v2.unstructured_api import call_api_async
 
 CHUNK_MAX_CHARS_DEFAULT: int = 500
 CHUNK_MULTI_PAGE_DEFAULT: bool = True
@@ -112,7 +112,7 @@ class Chunker(BaseProcess, ABC):
 
     @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
     async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
-        elements = await call_api(
+        elements = await call_api_async(
             server_url=self.config.chunking_endpoint,
             api_key=self.config.chunk_api_key.get_secret_value(),
             filename=elements_filepath,
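
The rename clarifies that Chunker.run_async awaits the asynchronous variant, while the synchronous call_api is what the v1 interfaces.py above now calls. A hypothetical sketch of how such a sync/async pair is typically structured (the real signatures live in unstructured_ingest/v2/unstructured_api.py and may differ):

import asyncio
from pathlib import Path

async def call_api_async(server_url: str, api_key: str, filename: Path, api_parameters: dict) -> list[dict]:
    ...  # issue the request via the unstructured-client SDK

def call_api(server_url: str, api_key: str, filename: Path, api_parameters: dict) -> list[dict]:
    # synchronous facade for callers that are not inside an event loop
    return asyncio.run(call_api_async(server_url, api_key, filename, api_parameters))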
unstructured_ingest/v2/processes/connectors/__init__.py

@@ -2,6 +2,8 @@ from __future__ import annotations
 
 import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.kafka  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.qdrant  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.sql  # noqa: F401
 from unstructured_ingest.v2.processes.connector_registry import (
     add_destination_entry,
@@ -11,17 +13,21 @@ from unstructured_ingest.v2.processes.connector_registry import (
 from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
 from .airtable import airtable_source_entry
 from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
-from .astradb import astra_db_destination_entry
+from .astradb import astra_db_destination_entry, astra_db_source_entry
 from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE
 from .azure_cognitive_search import azure_cognitive_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
+from .confluence import CONNECTOR_TYPE as CONFLUENCE_CONNECTOR_TYPE
+from .confluence import confluence_source_entry
 from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
 from .couchbase import couchbase_destination_entry, couchbase_source_entry
 from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
 from .delta_table import delta_table_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
+from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
+from .gitlab import gitlab_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
 from .google_drive import google_drive_source_entry
 from .kdbai import CONNECTOR_TYPE as KDBAI_CONNECTOR_TYPE
@@ -33,7 +39,7 @@ from .milvus import milvus_destination_entry
 from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
 from .mongodb import mongodb_destination_entry, mongodb_source_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
-from .onedrive import onedrive_source_entry
+from .onedrive import onedrive_destination_entry, onedrive_source_entry
 from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
 from .opensearch import opensearch_destination_entry, opensearch_source_entry
 from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
@@ -44,13 +50,12 @@ from .salesforce import CONNECTOR_TYPE as SALESFORCE_CONNECTOR_TYPE
 from .salesforce import salesforce_source_entry
 from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
 from .sharepoint import sharepoint_source_entry
-from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
-from .singlestore import singlestore_destination_entry
 from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
 from .slack import slack_source_entry
 from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
 from .weaviate import weaviate_destination_entry
 
+add_source_entry(source_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_source_entry)
 add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
 
 add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry)
@@ -73,6 +78,7 @@ add_source_entry(source_type=LOCAL_CONNECTOR_TYPE, entry=local_source_entry)
 add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destination_entry)
 
 add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
+add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)
 
 add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
 add_destination_entry(
@@ -88,9 +94,7 @@ add_source_entry(source_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_source_entry)
 
 add_destination_entry(destination_type=PINECONE_CONNECTOR_TYPE, entry=pinecone_destination_entry)
 add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_entry)
-add_destination_entry(
-    destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
-)
+
 add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
 add_destination_entry(
     destination_type=AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE,
@@ -102,4 +106,8 @@ add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
 
 add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry)
 
+add_source_entry(source_type=GITLAB_CONNECTOR_TYPE, entry=gitlab_source_entry)
+
 add_source_entry(source_type=SLACK_CONNECTOR_TYPE, entry=slack_source_entry)
+
+add_source_entry(source_type=CONFLUENCE_CONNECTOR_TYPE, entry=confluence_source_entry)
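
Registration summary: AstraDB gains a source entry alongside its existing destination, OneDrive gains a destination, GitLab and Confluence are new sources, and SingleStore's registration moves out of this module (it now lives under the sql subpackage, per the file list above). The kafka and qdrant subpackages are imported for their side effects, presumably registering their cloud/local variants on import. A hypothetical, simplified sketch of the registry pattern these calls rely on (the real entry types live in unstructured_ingest/v2/processes/connector_registry.py):

from dataclasses import dataclass
from typing import Any

@dataclass
class SourceEntry:  # stand-in for the real registry entry type
    indexer: Any
    downloader: Any

source_registry: dict[str, SourceEntry] = {}

def add_source_entry(source_type: str, entry: SourceEntry) -> None:
    if source_type in source_registry:
        raise ValueError(f"source connector {source_type} already registered")
    source_registry[source_type] = entry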