unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (187)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/main.py +0 -0
  7. unstructured_ingest/pipeline/interfaces.py +1 -1
  8. unstructured_ingest/pipeline/pipeline.py +1 -1
  9. unstructured_ingest/processes/chunker.py +4 -0
  10. unstructured_ingest/processes/connectors/airtable.py +4 -2
  11. unstructured_ingest/processes/connectors/astradb.py +48 -34
  12. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  13. unstructured_ingest/processes/connectors/confluence.py +0 -1
  14. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  15. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  16. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  18. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  19. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  20. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  21. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  22. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  23. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  24. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  25. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  26. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  27. unstructured_ingest/processes/connectors/outlook.py +1 -2
  28. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  29. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  30. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  31. unstructured_ingest/processes/connectors/slack.py +1 -2
  32. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  33. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  34. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  35. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  37. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  38. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  39. unstructured_ingest/processes/connectors/vectara.py +0 -2
  40. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  41. unstructured_ingest/processes/embedder.py +2 -2
  42. unstructured_ingest/processes/filter.py +1 -1
  43. unstructured_ingest/processes/partitioner.py +4 -0
  44. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  45. unstructured_ingest/unstructured_api.py +13 -8
  46. unstructured_ingest/utils/data_prep.py +8 -32
  47. unstructured_ingest-1.0.2.dist-info/METADATA +226 -0
  48. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/RECORD +50 -184
  49. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/WHEEL +1 -2
  50. examples/__init__.py +0 -0
  51. examples/airtable.py +0 -44
  52. examples/azure_cognitive_search.py +0 -55
  53. examples/chroma.py +0 -54
  54. examples/couchbase.py +0 -55
  55. examples/databricks_volumes_dest.py +0 -55
  56. examples/databricks_volumes_source.py +0 -53
  57. examples/delta_table.py +0 -45
  58. examples/discord_example.py +0 -36
  59. examples/elasticsearch.py +0 -49
  60. examples/google_drive.py +0 -45
  61. examples/kdbai.py +0 -54
  62. examples/local.py +0 -36
  63. examples/milvus.py +0 -44
  64. examples/mongodb.py +0 -53
  65. examples/opensearch.py +0 -50
  66. examples/pinecone.py +0 -57
  67. examples/s3.py +0 -38
  68. examples/salesforce.py +0 -44
  69. examples/sharepoint.py +0 -47
  70. examples/singlestore.py +0 -49
  71. examples/sql.py +0 -90
  72. examples/vectara.py +0 -54
  73. examples/weaviate.py +0 -44
  74. test/__init__.py +0 -0
  75. test/integration/__init__.py +0 -0
  76. test/integration/chunkers/__init__.py +0 -0
  77. test/integration/chunkers/test_chunkers.py +0 -31
  78. test/integration/connectors/__init__.py +0 -0
  79. test/integration/connectors/conftest.py +0 -38
  80. test/integration/connectors/databricks/__init__.py +0 -0
  81. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  82. test/integration/connectors/discord/__init__.py +0 -0
  83. test/integration/connectors/discord/test_discord.py +0 -90
  84. test/integration/connectors/duckdb/__init__.py +0 -0
  85. test/integration/connectors/duckdb/conftest.py +0 -14
  86. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  87. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  88. test/integration/connectors/elasticsearch/__init__.py +0 -0
  89. test/integration/connectors/elasticsearch/conftest.py +0 -34
  90. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  91. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  92. test/integration/connectors/sql/__init__.py +0 -0
  93. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  94. test/integration/connectors/sql/test_postgres.py +0 -201
  95. test/integration/connectors/sql/test_singlestore.py +0 -182
  96. test/integration/connectors/sql/test_snowflake.py +0 -244
  97. test/integration/connectors/sql/test_sqlite.py +0 -168
  98. test/integration/connectors/sql/test_vastdb.py +0 -34
  99. test/integration/connectors/test_astradb.py +0 -287
  100. test/integration/connectors/test_azure_ai_search.py +0 -254
  101. test/integration/connectors/test_chroma.py +0 -136
  102. test/integration/connectors/test_confluence.py +0 -111
  103. test/integration/connectors/test_delta_table.py +0 -183
  104. test/integration/connectors/test_dropbox.py +0 -151
  105. test/integration/connectors/test_github.py +0 -49
  106. test/integration/connectors/test_google_drive.py +0 -257
  107. test/integration/connectors/test_jira.py +0 -67
  108. test/integration/connectors/test_lancedb.py +0 -247
  109. test/integration/connectors/test_milvus.py +0 -208
  110. test/integration/connectors/test_mongodb.py +0 -335
  111. test/integration/connectors/test_neo4j.py +0 -244
  112. test/integration/connectors/test_notion.py +0 -152
  113. test/integration/connectors/test_onedrive.py +0 -163
  114. test/integration/connectors/test_pinecone.py +0 -387
  115. test/integration/connectors/test_qdrant.py +0 -216
  116. test/integration/connectors/test_redis.py +0 -143
  117. test/integration/connectors/test_s3.py +0 -184
  118. test/integration/connectors/test_sharepoint.py +0 -222
  119. test/integration/connectors/test_vectara.py +0 -282
  120. test/integration/connectors/test_zendesk.py +0 -120
  121. test/integration/connectors/utils/__init__.py +0 -0
  122. test/integration/connectors/utils/constants.py +0 -13
  123. test/integration/connectors/utils/docker.py +0 -151
  124. test/integration/connectors/utils/docker_compose.py +0 -59
  125. test/integration/connectors/utils/validation/__init__.py +0 -0
  126. test/integration/connectors/utils/validation/destination.py +0 -77
  127. test/integration/connectors/utils/validation/equality.py +0 -76
  128. test/integration/connectors/utils/validation/source.py +0 -331
  129. test/integration/connectors/utils/validation/utils.py +0 -36
  130. test/integration/connectors/weaviate/__init__.py +0 -0
  131. test/integration/connectors/weaviate/conftest.py +0 -15
  132. test/integration/connectors/weaviate/test_cloud.py +0 -39
  133. test/integration/connectors/weaviate/test_local.py +0 -152
  134. test/integration/embedders/__init__.py +0 -0
  135. test/integration/embedders/conftest.py +0 -13
  136. test/integration/embedders/test_azure_openai.py +0 -57
  137. test/integration/embedders/test_bedrock.py +0 -103
  138. test/integration/embedders/test_huggingface.py +0 -24
  139. test/integration/embedders/test_mixedbread.py +0 -71
  140. test/integration/embedders/test_octoai.py +0 -75
  141. test/integration/embedders/test_openai.py +0 -74
  142. test/integration/embedders/test_togetherai.py +0 -71
  143. test/integration/embedders/test_vertexai.py +0 -63
  144. test/integration/embedders/test_voyageai.py +0 -79
  145. test/integration/embedders/utils.py +0 -66
  146. test/integration/partitioners/__init__.py +0 -0
  147. test/integration/partitioners/test_partitioner.py +0 -76
  148. test/integration/utils.py +0 -15
  149. test/unit/__init__.py +0 -0
  150. test/unit/chunkers/__init__.py +0 -0
  151. test/unit/chunkers/test_chunkers.py +0 -49
  152. test/unit/connectors/__init__.py +0 -0
  153. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  154. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  155. test/unit/connectors/motherduck/__init__.py +0 -0
  156. test/unit/connectors/motherduck/test_base.py +0 -73
  157. test/unit/connectors/sql/__init__.py +0 -0
  158. test/unit/connectors/sql/test_sql.py +0 -152
  159. test/unit/connectors/test_confluence.py +0 -71
  160. test/unit/connectors/test_jira.py +0 -401
  161. test/unit/embed/__init__.py +0 -0
  162. test/unit/embed/test_mixedbreadai.py +0 -42
  163. test/unit/embed/test_octoai.py +0 -27
  164. test/unit/embed/test_openai.py +0 -28
  165. test/unit/embed/test_vertexai.py +0 -25
  166. test/unit/embed/test_voyageai.py +0 -24
  167. test/unit/embedders/__init__.py +0 -0
  168. test/unit/embedders/test_bedrock.py +0 -36
  169. test/unit/embedders/test_huggingface.py +0 -48
  170. test/unit/embedders/test_mixedbread.py +0 -37
  171. test/unit/embedders/test_octoai.py +0 -35
  172. test/unit/embedders/test_openai.py +0 -35
  173. test/unit/embedders/test_togetherai.py +0 -37
  174. test/unit/embedders/test_vertexai.py +0 -37
  175. test/unit/embedders/test_voyageai.py +0 -38
  176. test/unit/partitioners/__init__.py +0 -0
  177. test/unit/partitioners/test_partitioner.py +0 -63
  178. test/unit/test_error.py +0 -27
  179. test/unit/test_html.py +0 -112
  180. test/unit/test_interfaces.py +0 -26
  181. test/unit/test_utils.py +0 -220
  182. test/unit/utils/__init__.py +0 -0
  183. test/unit/utils/data_generator.py +0 -32
  184. unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
  185. unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
  186. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/entry_points.txt +0 -0
  187. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info/licenses}/LICENSE.md +0 -0
test/unit/test_utils.py DELETED
@@ -1,220 +0,0 @@
1
- import base64
2
- import json
3
- import zlib
4
- from datetime import datetime
5
- from typing import Any
6
-
7
- import pytest
8
- import pytz
9
- from pydantic import BaseModel, Field, Secret, SecretStr
10
- from pydantic.types import _SecretBase
11
-
12
- from unstructured_ingest.processes.connectors.utils import format_and_truncate_orig_elements
13
- from unstructured_ingest.utils.pydantic_models import (
14
- serialize_base_model,
15
- serialize_base_model_json,
16
- )
17
- from unstructured_ingest.utils.string_and_date_utils import (
18
- ensure_isoformat_datetime,
19
- fix_unescaped_unicode,
20
- json_to_dict,
21
- truncate_string_bytes,
22
- )
23
-
24
- flat_data = {"a": "test", "b": 4, "c": True}
25
-
26
-
27
- def test_json_to_dict_valid_json():
28
- json_string = '{"key": "value"}'
29
- expected_result = {"key": "value"}
30
- assert json_to_dict(json_string) == expected_result
31
- assert isinstance(json_to_dict(json_string), dict)
32
-
33
-
34
- def test_json_to_dict_malformed_json():
35
- json_string = '{"key": "value"'
36
- expected_result = '{"key": "value"'
37
- assert json_to_dict(json_string) == expected_result
38
- assert isinstance(json_to_dict(json_string), str)
39
-
40
-
41
- def test_json_to_dict_single_quotes():
42
- json_string = "{'key': 'value'}"
43
- expected_result = {"key": "value"}
44
- assert json_to_dict(json_string) == expected_result
45
- assert isinstance(json_to_dict(json_string), dict)
46
-
47
-
48
- def test_json_to_dict_path():
49
- json_string = "/path/to/file.json"
50
- expected_result = "/path/to/file.json"
51
- assert json_to_dict(json_string) == expected_result
52
- assert isinstance(json_to_dict(json_string), str)
53
-
54
-
55
- def test_ensure_isoformat_datetime_for_datetime():
56
- dt = ensure_isoformat_datetime(datetime(2021, 1, 1, 12, 0, 0))
57
- assert dt == "2021-01-01T12:00:00"
58
-
59
-
60
- def test_ensure_isoformat_datetime_for_datetime_with_tz():
61
- dt = ensure_isoformat_datetime(datetime(2021, 1, 1, 12, 0, 0, tzinfo=pytz.UTC))
62
- assert dt == "2021-01-01T12:00:00+00:00"
63
-
64
-
65
- def test_ensure_isoformat_datetime_for_string():
66
- dt = ensure_isoformat_datetime("2021-01-01T12:00:00")
67
- assert dt == "2021-01-01T12:00:00"
68
-
69
-
70
- def test_ensure_isoformat_datetime_for_string2():
71
- dt = ensure_isoformat_datetime("2021-01-01T12:00:00+00:00")
72
- assert dt == "2021-01-01T12:00:00+00:00"
73
-
74
-
75
- def test_ensure_isoformat_datetime_fails_on_string():
76
- with pytest.raises(ValueError):
77
- ensure_isoformat_datetime("bad timestamp")
78
-
79
-
80
- def test_ensure_isoformat_datetime_fails_on_int():
81
- with pytest.raises(TypeError):
82
- ensure_isoformat_datetime(1111)
83
-
84
-
85
- def test_truncate_string_bytes_return_truncated_string():
86
- test_string = "abcdef안녕하세요ghijklmn방갑습니opqrstu 더 길어지면 안되는 문자열vwxyz"
87
- max_bytes = 11
88
- result = truncate_string_bytes(test_string, max_bytes)
89
- assert result == "abcdef안"
90
- assert len(result.encode("utf-8")) <= max_bytes
91
-
92
-
93
- def test_truncate_string_bytes_return_untouched_string():
94
- test_string = "abcdef"
95
- max_bytes = 11
96
- result = truncate_string_bytes(test_string, max_bytes)
97
- assert result == "abcdef"
98
- assert len(result.encode("utf-8")) <= max_bytes
99
-
100
-
101
- def test_fix_unescaped_unicode_valid():
102
- text = "This is a test with unescaped unicode: \\u0041"
103
- expected = "This is a test with unescaped unicode: \u0041"
104
- assert fix_unescaped_unicode(text) == expected
105
-
106
-
107
- def test_fix_unescaped_unicode_no_unescaped_chars():
108
- text = "This is a test with no unescaped unicode: \u0041"
109
- expected = "This is a test with no unescaped unicode: \u0041"
110
- assert fix_unescaped_unicode(text) == expected
111
-
112
-
113
- def test_fix_unescaped_unicode_invalid_unicode():
114
- text = "This is a test with invalid unescaped unicode: \\uZZZZ"
115
- expected = "This is a test with invalid unescaped unicode: \\uZZZZ"
116
- assert fix_unescaped_unicode(text) == expected
117
-
118
-
119
- def test_fix_unescaped_unicode_encoding_error(caplog: pytest.LogCaptureFixture):
120
- text = "This is a test with unescaped unicode: \\uD83D"
121
- fix_unescaped_unicode(text)
122
- with caplog.at_level("WARNING"):
123
- fix_unescaped_unicode(text)
124
- assert "Failed to fix unescaped Unicode sequences" in caplog.text
125
-
126
-
127
- class MockChildBaseModel(BaseModel):
128
- child_secret_str: SecretStr
129
- child_secret_float: Secret[float]
130
- child_not_secret_dict: dict[str, Any] = Field(default_factory=dict)
131
-
132
-
133
- class MockBaseModel(BaseModel):
134
- secret_str: SecretStr
135
- not_secret_bool: bool
136
- secret_child_base: Secret[MockChildBaseModel]
137
- not_secret_list: list[int] = Field(default_factory=list)
138
-
139
-
140
- model = MockBaseModel(
141
- secret_str="secret string",
142
- not_secret_bool=False,
143
- secret_child_base=MockChildBaseModel(
144
- child_secret_str="child secret string",
145
- child_secret_float=3.14,
146
- child_not_secret_dict={"key": "value"},
147
- ),
148
- not_secret_list=[1, 2, 3],
149
- )
150
-
151
-
152
- def test_serialize_base_model():
153
-
154
- serialized_dict = model.model_dump()
155
- assert isinstance(serialized_dict["secret_str"], _SecretBase)
156
- assert isinstance(serialized_dict["secret_child_base"], _SecretBase)
157
-
158
- serialized_dict_w_secrets = serialize_base_model(model=model)
159
- assert not isinstance(serialized_dict_w_secrets["secret_str"], _SecretBase)
160
- assert not isinstance(serialized_dict_w_secrets["secret_child_base"], _SecretBase)
161
-
162
- expected_dict = {
163
- "secret_str": "secret string",
164
- "not_secret_bool": False,
165
- "secret_child_base": {
166
- "child_secret_str": "child secret string",
167
- "child_secret_float": 3.14,
168
- "child_not_secret_dict": {"key": "value"},
169
- },
170
- "not_secret_list": [1, 2, 3],
171
- }
172
-
173
- assert serialized_dict_w_secrets == expected_dict
174
-
175
-
176
- def test_serialize_base_model_json():
177
- serialized_json = model.model_dump_json()
178
- serialized_dict = json.loads(serialized_json)
179
- expected_dict = {
180
- "secret_str": "**********",
181
- "not_secret_bool": False,
182
- "secret_child_base": "**********",
183
- "not_secret_list": [1, 2, 3],
184
- }
185
- assert expected_dict == serialized_dict
186
-
187
- serialized_json_w_secrets = serialize_base_model_json(model=model)
188
- serialized_dict_w_secrets = json.loads(serialized_json_w_secrets)
189
- expected_dict_w_secrets = {
190
- "secret_str": "secret string",
191
- "not_secret_bool": False,
192
- "secret_child_base": {
193
- "child_secret_str": "child secret string",
194
- "child_secret_float": 3.14,
195
- "child_not_secret_dict": {"key": "value"},
196
- },
197
- "not_secret_list": [1, 2, 3],
198
- }
199
- assert expected_dict_w_secrets == serialized_dict_w_secrets
200
-
201
-
202
- def test_format_and_truncate_orig_elements():
203
- original_elements = [
204
- {
205
- "text": "Hello, world!",
206
- "metadata": {
207
- "image_base64": "iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAABwUlEQVR42mNk",
208
- "text_as_html": "<p>Hello, world!</p>",
209
- "page": 1,
210
- },
211
- }
212
- ]
213
- json_bytes = json.dumps(original_elements, sort_keys=True).encode("utf-8")
214
- deflated_bytes = zlib.compress(json_bytes)
215
- b64_deflated_bytes = base64.b64encode(deflated_bytes)
216
- b64_deflated_bytes.decode("utf-8")
217
-
218
- assert format_and_truncate_orig_elements(
219
- {"text": "Hello, world!", "metadata": {"orig_elements": b64_deflated_bytes.decode("utf-8")}}
220
- ) == [{"metadata": {"page": 1}}]
test/unit/utils/__init__.py DELETED
File without changes
test/unit/utils/data_generator.py DELETED
@@ -1,32 +0,0 @@
1
- import random
2
- from typing import Any, Type
3
-
4
- from faker import Faker
5
-
6
- fake = Faker()
7
-
8
- type_to_random_value_map = {
9
- str: fake.sentence,
10
- int: fake.random_int,
11
- float: fake.random_digit,
12
- bool: fake.boolean,
13
- }
14
- type_to_random_value_map_key = type_to_random_value_map.copy()
15
- type_to_random_value_map_key[str] = fake.word
16
-
17
-
18
- def generate_random_dictionary(key_type: Type = str, value_type: Type = str) -> dict:
19
- d = {}
20
- num_keys = random.randint(1, 3)
21
- for i in range(num_keys):
22
- key = type_to_random_value_map_key[key_type]()
23
- current_value_type = value_type
24
- if current_value_type == Any:
25
- current_value_type = random.choice(list(type_to_random_value_map.keys()) + [dict])
26
- value = (
27
- generate_random_dictionary(key_type=key_type, value_type=value_type)
28
- if current_value_type is dict
29
- else type_to_random_value_map[current_value_type]()
30
- )
31
- d[key] = value
32
- return d
unstructured_ingest-0.7.2.dist-info/METADATA DELETED
@@ -1,383 +0,0 @@
1
- Metadata-Version: 2.2
2
- Name: unstructured-ingest
3
- Version: 0.7.2
4
- Summary: A library that prepares raw documents for downstream ML tasks.
5
- Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
- Author: Unstructured Technologies
7
- Author-email: devops@unstructuredai.io
8
- License: Apache-2.0
9
- Keywords: NLP PDF HTML CV XML parsing preprocessing
10
- Classifier: Development Status :: 4 - Beta
11
- Classifier: Intended Audience :: Developers
12
- Classifier: Intended Audience :: Education
13
- Classifier: Intended Audience :: Science/Research
14
- Classifier: License :: OSI Approved :: Apache Software License
15
- Classifier: Operating System :: OS Independent
16
- Classifier: Programming Language :: Python :: 3
17
- Classifier: Programming Language :: Python :: 3.9
18
- Classifier: Programming Language :: Python :: 3.10
19
- Classifier: Programming Language :: Python :: 3.11
20
- Classifier: Programming Language :: Python :: 3.12
21
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
- Requires-Python: >=3.9.0,<3.14
23
- Description-Content-Type: text/markdown
24
- License-File: LICENSE.md
25
- Requires-Dist: tqdm
26
- Requires-Dist: opentelemetry-sdk
27
- Requires-Dist: click
28
- Requires-Dist: python-dateutil
29
- Requires-Dist: pydantic>=2.7
30
- Requires-Dist: dataclasses_json
31
- Requires-Dist: numpy
32
- Requires-Dist: pandas
33
- Provides-Extra: remote
34
- Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
35
- Requires-Dist: numpy; extra == "remote"
36
- Requires-Dist: pandas; extra == "remote"
37
- Provides-Extra: csv
38
- Requires-Dist: unstructured[tsv]; extra == "csv"
39
- Requires-Dist: numpy; extra == "csv"
40
- Requires-Dist: pandas; extra == "csv"
41
- Provides-Extra: doc
42
- Requires-Dist: unstructured[docx]; extra == "doc"
43
- Requires-Dist: numpy; extra == "doc"
44
- Requires-Dist: pandas; extra == "doc"
45
- Provides-Extra: docx
46
- Requires-Dist: unstructured[docx]; extra == "docx"
47
- Requires-Dist: numpy; extra == "docx"
48
- Requires-Dist: pandas; extra == "docx"
49
- Provides-Extra: epub
50
- Requires-Dist: unstructured[epub]; extra == "epub"
51
- Requires-Dist: numpy; extra == "epub"
52
- Requires-Dist: pandas; extra == "epub"
53
- Provides-Extra: md
54
- Requires-Dist: unstructured[md]; extra == "md"
55
- Requires-Dist: numpy; extra == "md"
56
- Requires-Dist: pandas; extra == "md"
57
- Provides-Extra: msg
58
- Requires-Dist: unstructured[msg]; extra == "msg"
59
- Requires-Dist: numpy; extra == "msg"
60
- Requires-Dist: pandas; extra == "msg"
61
- Provides-Extra: odt
62
- Requires-Dist: unstructured[odt]; extra == "odt"
63
- Requires-Dist: numpy; extra == "odt"
64
- Requires-Dist: pandas; extra == "odt"
65
- Provides-Extra: org
66
- Requires-Dist: unstructured[org]; extra == "org"
67
- Requires-Dist: numpy; extra == "org"
68
- Requires-Dist: pandas; extra == "org"
69
- Provides-Extra: pdf
70
- Requires-Dist: unstructured[pdf]; extra == "pdf"
71
- Requires-Dist: numpy; extra == "pdf"
72
- Requires-Dist: pandas; extra == "pdf"
73
- Provides-Extra: ppt
74
- Requires-Dist: unstructured[pptx]; extra == "ppt"
75
- Requires-Dist: numpy; extra == "ppt"
76
- Requires-Dist: pandas; extra == "ppt"
77
- Provides-Extra: pptx
78
- Requires-Dist: unstructured[pptx]; extra == "pptx"
79
- Requires-Dist: numpy; extra == "pptx"
80
- Requires-Dist: pandas; extra == "pptx"
81
- Provides-Extra: rtf
82
- Requires-Dist: unstructured[rtf]; extra == "rtf"
83
- Requires-Dist: numpy; extra == "rtf"
84
- Requires-Dist: pandas; extra == "rtf"
85
- Provides-Extra: rst
86
- Requires-Dist: unstructured[rst]; extra == "rst"
87
- Requires-Dist: numpy; extra == "rst"
88
- Requires-Dist: pandas; extra == "rst"
89
- Provides-Extra: tsv
90
- Requires-Dist: unstructured[tsv]; extra == "tsv"
91
- Requires-Dist: numpy; extra == "tsv"
92
- Requires-Dist: pandas; extra == "tsv"
93
- Provides-Extra: xlsx
94
- Requires-Dist: unstructured[xlsx]; extra == "xlsx"
95
- Requires-Dist: numpy; extra == "xlsx"
96
- Requires-Dist: pandas; extra == "xlsx"
97
- Provides-Extra: airtable
98
- Requires-Dist: pyairtable; extra == "airtable"
99
- Requires-Dist: numpy; extra == "airtable"
100
- Requires-Dist: pandas; extra == "airtable"
101
- Provides-Extra: astradb
102
- Requires-Dist: astrapy; extra == "astradb"
103
- Requires-Dist: numpy; extra == "astradb"
104
- Requires-Dist: pandas; extra == "astradb"
105
- Provides-Extra: azure
106
- Requires-Dist: adlfs; extra == "azure"
107
- Requires-Dist: fsspec; extra == "azure"
108
- Requires-Dist: numpy; extra == "azure"
109
- Requires-Dist: pandas; extra == "azure"
110
- Provides-Extra: azure-ai-search
111
- Requires-Dist: azure-search-documents; extra == "azure-ai-search"
112
- Requires-Dist: numpy; extra == "azure-ai-search"
113
- Requires-Dist: pandas; extra == "azure-ai-search"
114
- Provides-Extra: biomed
115
- Requires-Dist: requests; extra == "biomed"
116
- Requires-Dist: bs4; extra == "biomed"
117
- Requires-Dist: numpy; extra == "biomed"
118
- Requires-Dist: pandas; extra == "biomed"
119
- Provides-Extra: box
120
- Requires-Dist: boxfs; extra == "box"
121
- Requires-Dist: fsspec; extra == "box"
122
- Requires-Dist: numpy; extra == "box"
123
- Requires-Dist: pandas; extra == "box"
124
- Provides-Extra: chroma
125
- Requires-Dist: chromadb; extra == "chroma"
126
- Requires-Dist: numpy; extra == "chroma"
127
- Requires-Dist: pandas; extra == "chroma"
128
- Provides-Extra: clarifai
129
- Requires-Dist: clarifai; extra == "clarifai"
130
- Requires-Dist: numpy; extra == "clarifai"
131
- Requires-Dist: pandas; extra == "clarifai"
132
- Provides-Extra: confluence
133
- Requires-Dist: atlassian-python-api; extra == "confluence"
134
- Requires-Dist: requests; extra == "confluence"
135
- Requires-Dist: numpy; extra == "confluence"
136
- Requires-Dist: pandas; extra == "confluence"
137
- Provides-Extra: couchbase
138
- Requires-Dist: couchbase; extra == "couchbase"
139
- Requires-Dist: numpy; extra == "couchbase"
140
- Requires-Dist: pandas; extra == "couchbase"
141
- Provides-Extra: delta-table
142
- Requires-Dist: deltalake; extra == "delta-table"
143
- Requires-Dist: boto3; extra == "delta-table"
144
- Requires-Dist: numpy; extra == "delta-table"
145
- Requires-Dist: pandas; extra == "delta-table"
146
- Provides-Extra: discord
147
- Requires-Dist: discord.py; extra == "discord"
148
- Requires-Dist: numpy; extra == "discord"
149
- Requires-Dist: pandas; extra == "discord"
150
- Provides-Extra: dropbox
151
- Requires-Dist: dropboxdrivefs; extra == "dropbox"
152
- Requires-Dist: fsspec; extra == "dropbox"
153
- Requires-Dist: numpy; extra == "dropbox"
154
- Requires-Dist: pandas; extra == "dropbox"
155
- Provides-Extra: duckdb
156
- Requires-Dist: duckdb; extra == "duckdb"
157
- Requires-Dist: numpy; extra == "duckdb"
158
- Requires-Dist: pandas; extra == "duckdb"
159
- Provides-Extra: elasticsearch
160
- Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
161
- Requires-Dist: numpy; extra == "elasticsearch"
162
- Requires-Dist: pandas; extra == "elasticsearch"
163
- Provides-Extra: gcs
164
- Requires-Dist: gcsfs; extra == "gcs"
165
- Requires-Dist: bs4; extra == "gcs"
166
- Requires-Dist: fsspec; extra == "gcs"
167
- Requires-Dist: numpy; extra == "gcs"
168
- Requires-Dist: pandas; extra == "gcs"
169
- Provides-Extra: github
170
- Requires-Dist: requests; extra == "github"
171
- Requires-Dist: pygithub>1.58.0; extra == "github"
172
- Requires-Dist: numpy; extra == "github"
173
- Requires-Dist: pandas; extra == "github"
174
- Provides-Extra: gitlab
175
- Requires-Dist: python-gitlab; extra == "gitlab"
176
- Requires-Dist: numpy; extra == "gitlab"
177
- Requires-Dist: pandas; extra == "gitlab"
178
- Provides-Extra: google-drive
179
- Requires-Dist: google-api-python-client; extra == "google-drive"
180
- Requires-Dist: numpy; extra == "google-drive"
181
- Requires-Dist: pandas; extra == "google-drive"
182
- Provides-Extra: hubspot
183
- Requires-Dist: urllib3; extra == "hubspot"
184
- Requires-Dist: hubspot-api-client; extra == "hubspot"
185
- Requires-Dist: numpy; extra == "hubspot"
186
- Requires-Dist: pandas; extra == "hubspot"
187
- Provides-Extra: ibm-watsonx-s3
188
- Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
189
- Requires-Dist: httpx; extra == "ibm-watsonx-s3"
190
- Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
191
- Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
192
- Requires-Dist: numpy; extra == "ibm-watsonx-s3"
193
- Requires-Dist: pandas; extra == "ibm-watsonx-s3"
194
- Provides-Extra: jira
195
- Requires-Dist: atlassian-python-api; extra == "jira"
196
- Requires-Dist: numpy; extra == "jira"
197
- Requires-Dist: pandas; extra == "jira"
198
- Provides-Extra: kafka
199
- Requires-Dist: confluent-kafka; extra == "kafka"
200
- Requires-Dist: numpy; extra == "kafka"
201
- Requires-Dist: pandas; extra == "kafka"
202
- Provides-Extra: kdbai
203
- Requires-Dist: kdbai-client>=1.4.0; extra == "kdbai"
204
- Requires-Dist: numpy; extra == "kdbai"
205
- Requires-Dist: pandas; extra == "kdbai"
206
- Provides-Extra: lancedb
207
- Requires-Dist: lancedb; extra == "lancedb"
208
- Requires-Dist: numpy; extra == "lancedb"
209
- Requires-Dist: pandas; extra == "lancedb"
210
- Provides-Extra: milvus
211
- Requires-Dist: pymilvus; extra == "milvus"
212
- Requires-Dist: numpy; extra == "milvus"
213
- Requires-Dist: pandas; extra == "milvus"
214
- Provides-Extra: mongodb
215
- Requires-Dist: pymongo; extra == "mongodb"
216
- Requires-Dist: numpy; extra == "mongodb"
217
- Requires-Dist: pandas; extra == "mongodb"
218
- Provides-Extra: neo4j
219
- Requires-Dist: neo4j-rust-ext; extra == "neo4j"
220
- Requires-Dist: networkx; extra == "neo4j"
221
- Requires-Dist: cymple; extra == "neo4j"
222
- Requires-Dist: numpy; extra == "neo4j"
223
- Requires-Dist: pandas; extra == "neo4j"
224
- Provides-Extra: notion
225
- Requires-Dist: notion-client; extra == "notion"
226
- Requires-Dist: httpx; extra == "notion"
227
- Requires-Dist: backoff; extra == "notion"
228
- Requires-Dist: htmlBuilder; extra == "notion"
229
- Requires-Dist: numpy; extra == "notion"
230
- Requires-Dist: pandas; extra == "notion"
231
- Provides-Extra: onedrive
232
- Requires-Dist: requests; extra == "onedrive"
233
- Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
234
- Requires-Dist: msal; extra == "onedrive"
235
- Requires-Dist: numpy; extra == "onedrive"
236
- Requires-Dist: pandas; extra == "onedrive"
237
- Provides-Extra: opensearch
238
- Requires-Dist: opensearch-py; extra == "opensearch"
239
- Requires-Dist: numpy; extra == "opensearch"
240
- Requires-Dist: pandas; extra == "opensearch"
241
- Provides-Extra: outlook
242
- Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
243
- Requires-Dist: msal; extra == "outlook"
244
- Requires-Dist: numpy; extra == "outlook"
245
- Requires-Dist: pandas; extra == "outlook"
246
- Provides-Extra: pinecone
247
- Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
248
- Requires-Dist: numpy; extra == "pinecone"
249
- Requires-Dist: pandas; extra == "pinecone"
250
- Provides-Extra: postgres
251
- Requires-Dist: psycopg2-binary; extra == "postgres"
252
- Requires-Dist: numpy; extra == "postgres"
253
- Requires-Dist: pandas; extra == "postgres"
254
- Provides-Extra: qdrant
255
- Requires-Dist: qdrant-client; extra == "qdrant"
256
- Requires-Dist: numpy; extra == "qdrant"
257
- Requires-Dist: pandas; extra == "qdrant"
258
- Provides-Extra: reddit
259
- Requires-Dist: praw; extra == "reddit"
260
- Requires-Dist: numpy; extra == "reddit"
261
- Requires-Dist: pandas; extra == "reddit"
262
- Provides-Extra: redis
263
- Requires-Dist: redis; extra == "redis"
264
- Requires-Dist: numpy; extra == "redis"
265
- Requires-Dist: pandas; extra == "redis"
266
- Provides-Extra: s3
267
- Requires-Dist: s3fs; extra == "s3"
268
- Requires-Dist: fsspec; extra == "s3"
269
- Requires-Dist: numpy; extra == "s3"
270
- Requires-Dist: pandas; extra == "s3"
271
- Provides-Extra: sharepoint
272
- Requires-Dist: requests; extra == "sharepoint"
273
- Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
274
- Requires-Dist: msal; extra == "sharepoint"
275
- Requires-Dist: numpy; extra == "sharepoint"
276
- Requires-Dist: pandas; extra == "sharepoint"
277
- Provides-Extra: salesforce
278
- Requires-Dist: simple-salesforce; extra == "salesforce"
279
- Requires-Dist: numpy; extra == "salesforce"
280
- Requires-Dist: pandas; extra == "salesforce"
281
- Provides-Extra: sftp
282
- Requires-Dist: paramiko; extra == "sftp"
283
- Requires-Dist: fsspec; extra == "sftp"
284
- Requires-Dist: numpy; extra == "sftp"
285
- Requires-Dist: pandas; extra == "sftp"
286
- Provides-Extra: slack
287
- Requires-Dist: slack_sdk[optional]; extra == "slack"
288
- Requires-Dist: numpy; extra == "slack"
289
- Requires-Dist: pandas; extra == "slack"
290
- Provides-Extra: snowflake
291
- Requires-Dist: snowflake-connector-python; extra == "snowflake"
292
- Requires-Dist: psycopg2-binary; extra == "snowflake"
293
- Requires-Dist: numpy; extra == "snowflake"
294
- Requires-Dist: pandas; extra == "snowflake"
295
- Provides-Extra: wikipedia
296
- Requires-Dist: wikipedia; extra == "wikipedia"
297
- Requires-Dist: numpy; extra == "wikipedia"
298
- Requires-Dist: pandas; extra == "wikipedia"
299
- Provides-Extra: weaviate
300
- Requires-Dist: weaviate-client; extra == "weaviate"
301
- Requires-Dist: numpy; extra == "weaviate"
302
- Requires-Dist: pandas; extra == "weaviate"
303
- Provides-Extra: databricks-volumes
304
- Requires-Dist: databricks-sdk; extra == "databricks-volumes"
305
- Requires-Dist: numpy; extra == "databricks-volumes"
306
- Requires-Dist: pandas; extra == "databricks-volumes"
307
- Provides-Extra: databricks-delta-tables
308
- Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
309
- Requires-Dist: numpy; extra == "databricks-delta-tables"
310
- Requires-Dist: pandas; extra == "databricks-delta-tables"
311
- Provides-Extra: singlestore
312
- Requires-Dist: singlestoredb; extra == "singlestore"
313
- Requires-Dist: numpy; extra == "singlestore"
314
- Requires-Dist: pandas; extra == "singlestore"
315
- Provides-Extra: vectara
316
- Requires-Dist: requests; extra == "vectara"
317
- Requires-Dist: httpx; extra == "vectara"
318
- Requires-Dist: aiofiles; extra == "vectara"
319
- Requires-Dist: numpy; extra == "vectara"
320
- Requires-Dist: pandas; extra == "vectara"
321
- Provides-Extra: vastdb
322
- Requires-Dist: ibis; extra == "vastdb"
323
- Requires-Dist: vastdb; extra == "vastdb"
324
- Requires-Dist: pyarrow; extra == "vastdb"
325
- Requires-Dist: numpy; extra == "vastdb"
326
- Requires-Dist: pandas; extra == "vastdb"
327
- Provides-Extra: zendesk
328
- Requires-Dist: aiofiles; extra == "zendesk"
329
- Requires-Dist: bs4; extra == "zendesk"
330
- Requires-Dist: httpx; extra == "zendesk"
331
- Requires-Dist: numpy; extra == "zendesk"
332
- Requires-Dist: pandas; extra == "zendesk"
333
- Provides-Extra: embed-huggingface
334
- Requires-Dist: sentence-transformers; extra == "embed-huggingface"
335
- Requires-Dist: numpy; extra == "embed-huggingface"
336
- Requires-Dist: pandas; extra == "embed-huggingface"
337
- Provides-Extra: embed-octoai
338
- Requires-Dist: openai; extra == "embed-octoai"
339
- Requires-Dist: tiktoken; extra == "embed-octoai"
340
- Requires-Dist: numpy; extra == "embed-octoai"
341
- Requires-Dist: pandas; extra == "embed-octoai"
342
- Provides-Extra: embed-vertexai
343
- Requires-Dist: vertexai; extra == "embed-vertexai"
344
- Requires-Dist: numpy; extra == "embed-vertexai"
345
- Requires-Dist: pandas; extra == "embed-vertexai"
346
- Provides-Extra: embed-voyageai
347
- Requires-Dist: voyageai; extra == "embed-voyageai"
348
- Requires-Dist: numpy; extra == "embed-voyageai"
349
- Requires-Dist: pandas; extra == "embed-voyageai"
350
- Provides-Extra: embed-mixedbreadai
351
- Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
352
- Requires-Dist: numpy; extra == "embed-mixedbreadai"
353
- Requires-Dist: pandas; extra == "embed-mixedbreadai"
354
- Provides-Extra: openai
355
- Requires-Dist: openai; extra == "openai"
356
- Requires-Dist: tiktoken; extra == "openai"
357
- Requires-Dist: numpy; extra == "openai"
358
- Requires-Dist: pandas; extra == "openai"
359
- Provides-Extra: bedrock
360
- Requires-Dist: aioboto3; extra == "bedrock"
361
- Requires-Dist: boto3; extra == "bedrock"
362
- Requires-Dist: numpy; extra == "bedrock"
363
- Requires-Dist: pandas; extra == "bedrock"
364
- Provides-Extra: togetherai
365
- Requires-Dist: together; extra == "togetherai"
366
- Requires-Dist: numpy; extra == "togetherai"
367
- Requires-Dist: pandas; extra == "togetherai"
368
- Dynamic: author
369
- Dynamic: author-email
370
- Dynamic: classifier
371
- Dynamic: description
372
- Dynamic: description-content-type
373
- Dynamic: home-page
374
- Dynamic: keywords
375
- Dynamic: license
376
- Dynamic: provides-extra
377
- Dynamic: requires-dist
378
- Dynamic: requires-python
379
- Dynamic: summary
380
-
381
- # Unstructured Ingest
382
-
383
- For details, see the [Unstructured Ingest overview](https://docs.unstructured.io/ingestion/overview) in the Unstructured documentation.
@@ -1,3 +0,0 @@
1
- examples
2
- test
3
- unstructured_ingest