unstructured-ingest: unstructured_ingest-0.3.11-py3-none-any.whl → unstructured_ingest-0.3.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (36)
  1. test/integration/connectors/test_milvus.py +13 -0
  2. test/integration/connectors/test_onedrive.py +6 -0
  3. test/integration/connectors/test_redis.py +119 -0
  4. test/integration/connectors/test_vectara.py +270 -0
  5. test/integration/embedders/test_bedrock.py +28 -0
  6. test/integration/embedders/test_octoai.py +14 -0
  7. test/integration/embedders/test_openai.py +13 -0
  8. test/integration/embedders/test_togetherai.py +10 -0
  9. test/integration/partitioners/test_partitioner.py +2 -2
  10. test/unit/embed/test_octoai.py +8 -1
  11. unstructured_ingest/__version__.py +1 -1
  12. unstructured_ingest/embed/bedrock.py +39 -11
  13. unstructured_ingest/embed/interfaces.py +5 -0
  14. unstructured_ingest/embed/octoai.py +44 -3
  15. unstructured_ingest/embed/openai.py +37 -1
  16. unstructured_ingest/embed/togetherai.py +28 -1
  17. unstructured_ingest/embed/voyageai.py +33 -1
  18. unstructured_ingest/v2/errors.py +18 -0
  19. unstructured_ingest/v2/processes/connectors/__init__.py +7 -0
  20. unstructured_ingest/v2/processes/connectors/chroma.py +0 -1
  21. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +5 -2
  22. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +14 -3
  23. unstructured_ingest/v2/processes/connectors/milvus.py +15 -6
  24. unstructured_ingest/v2/processes/connectors/neo4j.py +2 -0
  25. unstructured_ingest/v2/processes/connectors/onedrive.py +79 -25
  26. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +0 -1
  27. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  28. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  29. unstructured_ingest/v2/unstructured_api.py +25 -2
  30. {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/METADATA +23 -19
  31. {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/RECORD +35 -31
  32. test/integration/connectors/test_kafka.py +0 -304
  33. {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/LICENSE.md +0 -0
  34. {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/WHEEL +0 -0
  35. {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/entry_points.txt +0 -0
  36. {unstructured_ingest-0.3.11.dist-info → unstructured_ingest-0.3.12.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unstructured-ingest
3
- Version: 0.3.11
3
+ Version: 0.3.12
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -23,13 +23,13 @@ Requires-Python: >=3.9.0,<3.13
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
25
  Requires-Dist: ndjson
26
+ Requires-Dist: python-dateutil
27
+ Requires-Dist: pydantic>=2.7
26
28
  Requires-Dist: pandas
27
29
  Requires-Dist: dataclasses-json
28
- Requires-Dist: pydantic>=2.7
29
- Requires-Dist: opentelemetry-sdk
30
- Requires-Dist: python-dateutil
31
- Requires-Dist: click
32
30
  Requires-Dist: tqdm
31
+ Requires-Dist: click
32
+ Requires-Dist: opentelemetry-sdk
33
33
  Provides-Extra: airtable
34
34
  Requires-Dist: pyairtable; extra == "airtable"
35
35
  Provides-Extra: astradb
@@ -45,8 +45,8 @@ Provides-Extra: biomed
45
45
  Requires-Dist: requests; extra == "biomed"
46
46
  Requires-Dist: bs4; extra == "biomed"
47
47
  Provides-Extra: box
48
- Requires-Dist: fsspec; extra == "box"
49
48
  Requires-Dist: boxfs; extra == "box"
49
+ Requires-Dist: fsspec; extra == "box"
50
50
  Provides-Extra: chroma
51
51
  Requires-Dist: chromadb; extra == "chroma"
52
52
  Provides-Extra: clarifai
@@ -61,8 +61,8 @@ Requires-Dist: unstructured[tsv]; extra == "csv"
61
61
  Provides-Extra: databricks-volumes
62
62
  Requires-Dist: databricks-sdk; extra == "databricks-volumes"
63
63
  Provides-Extra: delta-table
64
- Requires-Dist: boto3; extra == "delta-table"
65
64
  Requires-Dist: deltalake; extra == "delta-table"
65
+ Requires-Dist: boto3; extra == "delta-table"
66
66
  Provides-Extra: discord
67
67
  Requires-Dist: discord-py; extra == "discord"
68
68
  Provides-Extra: doc
@@ -70,8 +70,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
70
70
  Provides-Extra: docx
71
71
  Requires-Dist: unstructured[docx]; extra == "docx"
72
72
  Provides-Extra: dropbox
73
- Requires-Dist: fsspec; extra == "dropbox"
74
73
  Requires-Dist: dropboxdrivefs; extra == "dropbox"
74
+ Requires-Dist: fsspec; extra == "dropbox"
75
75
  Provides-Extra: duckdb
76
76
  Requires-Dist: duckdb; extra == "duckdb"
77
77
  Provides-Extra: elasticsearch
@@ -90,12 +90,12 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
90
90
  Provides-Extra: epub
91
91
  Requires-Dist: unstructured[epub]; extra == "epub"
92
92
  Provides-Extra: gcs
93
- Requires-Dist: fsspec; extra == "gcs"
94
93
  Requires-Dist: gcsfs; extra == "gcs"
95
94
  Requires-Dist: bs4; extra == "gcs"
95
+ Requires-Dist: fsspec; extra == "gcs"
96
96
  Provides-Extra: github
97
- Requires-Dist: requests; extra == "github"
98
97
  Requires-Dist: pygithub>1.58.0; extra == "github"
98
+ Requires-Dist: requests; extra == "github"
99
99
  Provides-Extra: gitlab
100
100
  Requires-Dist: python-gitlab; extra == "gitlab"
101
101
  Provides-Extra: google-drive
@@ -121,19 +121,19 @@ Provides-Extra: msg
121
121
  Requires-Dist: unstructured[msg]; extra == "msg"
122
122
  Provides-Extra: neo4j
123
123
  Requires-Dist: neo4j; extra == "neo4j"
124
- Requires-Dist: cymple; extra == "neo4j"
125
124
  Requires-Dist: networkx; extra == "neo4j"
125
+ Requires-Dist: cymple; extra == "neo4j"
126
126
  Provides-Extra: notion
127
127
  Requires-Dist: backoff; extra == "notion"
128
- Requires-Dist: notion-client; extra == "notion"
129
- Requires-Dist: httpx; extra == "notion"
130
128
  Requires-Dist: htmlBuilder; extra == "notion"
129
+ Requires-Dist: httpx; extra == "notion"
130
+ Requires-Dist: notion-client; extra == "notion"
131
131
  Provides-Extra: odt
132
132
  Requires-Dist: unstructured[odt]; extra == "odt"
133
133
  Provides-Extra: onedrive
134
- Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
135
- Requires-Dist: msal; extra == "onedrive"
136
134
  Requires-Dist: bs4; extra == "onedrive"
135
+ Requires-Dist: msal; extra == "onedrive"
136
+ Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
137
137
  Provides-Extra: openai
138
138
  Requires-Dist: openai; extra == "openai"
139
139
  Requires-Dist: tiktoken; extra == "openai"
@@ -142,8 +142,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
142
142
  Provides-Extra: org
143
143
  Requires-Dist: unstructured[org]; extra == "org"
144
144
  Provides-Extra: outlook
145
- Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
146
145
  Requires-Dist: msal; extra == "outlook"
146
+ Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
147
147
  Provides-Extra: pdf
148
148
  Requires-Dist: unstructured[pdf]; extra == "pdf"
149
149
  Provides-Extra: pinecone
@@ -158,6 +158,8 @@ Provides-Extra: qdrant
158
158
  Requires-Dist: qdrant-client; extra == "qdrant"
159
159
  Provides-Extra: reddit
160
160
  Requires-Dist: praw; extra == "reddit"
161
+ Provides-Extra: redis
162
+ Requires-Dist: redis; extra == "redis"
161
163
  Provides-Extra: remote
162
164
  Requires-Dist: unstructured-client>=0.26.1; extra == "remote"
163
165
  Provides-Extra: rst
@@ -165,16 +167,16 @@ Requires-Dist: unstructured[rst]; extra == "rst"
165
167
  Provides-Extra: rtf
166
168
  Requires-Dist: unstructured[rtf]; extra == "rtf"
167
169
  Provides-Extra: s3
168
- Requires-Dist: fsspec; extra == "s3"
169
170
  Requires-Dist: s3fs; extra == "s3"
171
+ Requires-Dist: fsspec; extra == "s3"
170
172
  Provides-Extra: salesforce
171
173
  Requires-Dist: simple-salesforce; extra == "salesforce"
172
174
  Provides-Extra: sftp
173
- Requires-Dist: fsspec; extra == "sftp"
174
175
  Requires-Dist: paramiko; extra == "sftp"
176
+ Requires-Dist: fsspec; extra == "sftp"
175
177
  Provides-Extra: sharepoint
176
- Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
177
178
  Requires-Dist: msal; extra == "sharepoint"
179
+ Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
178
180
  Provides-Extra: singlestore
179
181
  Requires-Dist: singlestoredb; extra == "singlestore"
180
182
  Provides-Extra: slack
@@ -187,6 +189,8 @@ Requires-Dist: together; extra == "togetherai"
187
189
  Provides-Extra: tsv
188
190
  Requires-Dist: unstructured[tsv]; extra == "tsv"
189
191
  Provides-Extra: vectara
192
+ Requires-Dist: httpx; extra == "vectara"
193
+ Requires-Dist: aiofiles; extra == "vectara"
190
194
  Requires-Dist: requests; extra == "vectara"
191
195
  Provides-Extra: weaviate
192
196
  Requires-Dist: weaviate-client; extra == "weaviate"
@@ -10,15 +10,16 @@ test/integration/connectors/test_azure_ai_search.py,sha256=EGV-G_Lq3h6pHhhmmQGWj
10
10
  test/integration/connectors/test_chroma.py,sha256=KQCzBJsOHAOtg0Ehp0tNtuYchFtiSmhHDKyOju33kJg,3686
11
11
  test/integration/connectors/test_confluence.py,sha256=adJxIggjuO-jgMimBZdv_AqWeBFlQoodELucIYwWC98,3546
12
12
  test/integration/connectors/test_delta_table.py,sha256=xsnJmwlWVQrccYeAtpt2lm0DYm2jGxiKXeERQXqCDCM,6884
13
- test/integration/connectors/test_kafka.py,sha256=FtHLptvS9V3Br7wCm2Xyh_ulz8_wWvCOKKEd0xD9LyM,10758
14
13
  test/integration/connectors/test_lancedb.py,sha256=U2HfIrf6iJ7lYMn-vz0j-LesVyDY-jc9QrQhlJVhG9Q,9183
15
- test/integration/connectors/test_milvus.py,sha256=abYQOjF8grEFj3FB1_wQgFSbWPFWfZ2pEsgKarfKJE4,6574
14
+ test/integration/connectors/test_milvus.py,sha256=aRT5SpJHY4NA8pG_LcVTJwYwvLw2W_OOE-NIfDq03SE,7015
16
15
  test/integration/connectors/test_mongodb.py,sha256=UZ4eo61MisCw4s0p7HWaediN7M-lSddMDs71RFgdmJs,12347
17
16
  test/integration/connectors/test_neo4j.py,sha256=Esiq_Z9k1JLrWNXPmLBsX3LLwyEozwKoxX7iwMEJjRM,8252
18
- test/integration/connectors/test_onedrive.py,sha256=KIkBwKh1hnv203VCL2UABnDkS_bP4NxOFm1AL8EPGLA,3554
17
+ test/integration/connectors/test_onedrive.py,sha256=Bp9Ayv59JnfsjSwqbQ-zYvg-XAPGgZfKJ45Asc0y1bM,3808
19
18
  test/integration/connectors/test_pinecone.py,sha256=suPFi40d6rHXurQQLIpCzW5XRTdgzlP-f-KLPhGCUHo,10208
20
19
  test/integration/connectors/test_qdrant.py,sha256=hyuqSJDaylkQVxWh7byD8jo8bwPuBxSa8MWRD3sBu-Y,7906
20
+ test/integration/connectors/test_redis.py,sha256=Q_KAZPNE9NIoRN2UsbXtc1fe_aJg66RbSQtS3OKNpc0,4327
21
21
  test/integration/connectors/test_s3.py,sha256=PJaAwFRF2lXMQlkbv9JHpngPc6706ML7zowOlXT3TcY,7033
22
+ test/integration/connectors/test_vectara.py,sha256=_FQHbhxL3f1rLV9MrHOvcljm_4qTVf5xl-Q7MplE_xs,8688
22
23
  test/integration/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
24
  test/integration/connectors/databricks/test_volumes_native.py,sha256=ig60-nCdLF0GsgJowG9eRaG28iuoYHtuf12HdK6OE1I,7764
24
25
  test/integration/connectors/duckdb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -50,17 +51,17 @@ test/integration/connectors/weaviate/test_local.py,sha256=SK6iEwQUKiCd0X99BEk8Gl
50
51
  test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
52
  test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
52
53
  test/integration/embedders/test_azure_openai.py,sha256=6tFpKFBFRXD49imhhRzsvy3MPtuZ4L1PtnKyMVBRAqc,1808
53
- test/integration/embedders/test_bedrock.py,sha256=0oBRNS_DtFDGQ22Z1T3t6VOJ31PrItgvnJpqcLe9Fg4,1903
54
+ test/integration/embedders/test_bedrock.py,sha256=ZyS17PLaHOeh-ykrd71Jkgg_ext7aOadxvxVJ_4IvFE,2852
54
55
  test/integration/embedders/test_huggingface.py,sha256=0mMTOO-Nh7KB70AGs_7LLQIxMYrnSPqyihriUeqACbM,1007
55
56
  test/integration/embedders/test_mixedbread.py,sha256=RrLv8SByMNXsgrlh94RbaT-VyxZ4-DILO-OPpmOwvSI,1441
56
- test/integration/embedders/test_octoai.py,sha256=LnR0BLttamW5PGid6jFxATDAi0x7hq5iWMXurbHP6TI,1328
57
- test/integration/embedders/test_openai.py,sha256=0jlFqEeeCneIWX9tGyC3TXeUNqsMXR7u5n7uEIaAQKo,1328
58
- test/integration/embedders/test_togetherai.py,sha256=0W1ScD5yb1D9hPC2ewUsuCHLUOpCuM083YMBhqAI9fw,1395
57
+ test/integration/embedders/test_octoai.py,sha256=oQYpYh2XaKhiqtnOSpH0rP9TQrzykZ1-3C3jZRurPu8,1734
58
+ test/integration/embedders/test_openai.py,sha256=s4_XGQfVpsTb4hKh2QZkXdOG_MnF5OQgL98kzNjTFCg,1664
59
+ test/integration/embedders/test_togetherai.py,sha256=3otyr6i9smJMyXbhKCcaC2gx813rqGaZTKi2sEM7GIQ,1707
59
60
  test/integration/embedders/test_vertexai.py,sha256=OtoFzmrWWhGIO5Bbl5zt_4sp6qRHZxtaDQKpGcfzNLM,1345
60
61
  test/integration/embedders/test_voyageai.py,sha256=Zqf7nn1AxfBDBr5A9Jr-5pxes4QNvfKiyeGexCCm4nY,1346
61
62
  test/integration/embedders/utils.py,sha256=3AMKMBpgBep_0jFqrqMHH8BJo6w60kpouSZ5JPJTwIA,1850
62
63
  test/integration/partitioners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
- test/integration/partitioners/test_partitioner.py,sha256=KEpnhsz2YNAoQ2UZGOTsi1_uk1h4Vg-gGTsy5Fe9OCw,2846
64
+ test/integration/partitioners/test_partitioner.py,sha256=MEQJbRoc01uPLT6O8CkXeQF_DXK21nz3KVJkzkBtsgM,2835
64
65
  test/unit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
65
66
  test/unit/test_chunking_utils.py,sha256=0iPwfnMPpyTm-yOE0BXMnEQQP4iguS6NhOqgMQU5nhk,1390
66
67
  test/unit/test_error.py,sha256=RflmngCdFNKOLXVfLnUdNfY3Mfg3k7DTEzfIl0B-syU,840
@@ -69,7 +70,7 @@ test/unit/test_logger.py,sha256=0SKndXE_VRd8XmUHkrj7zuBQHZscXx3ZQllMEOvtF9Y,2380
69
70
  test/unit/test_utils.py,sha256=Q6mp9YZPah8z3-2lreyRbmAc7m2Y_w26_N9vocSInoA,5421
70
71
  test/unit/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
71
72
  test/unit/embed/test_mixedbreadai.py,sha256=XFNJDP5pIgF3eQYwBiuEWmH3zZWx72Wpwyv-Q4m0DJg,1332
72
- test/unit/embed/test_octoai.py,sha256=Ha9EgAW64Q45hFj51tToe8RyKXWXwqAkdDqSFDMu37Q,831
73
+ test/unit/embed/test_octoai.py,sha256=pouR4J6B_mrlu4TsA5yr2Ln_LCYL2pGBojXY5KEqvKI,1053
73
74
  test/unit/embed/test_openai.py,sha256=0O1yshDcE0BMKv1yJqrNuiNLSdPhLpKqJ-D_wmnidsM,831
74
75
  test/unit/embed/test_vertexai.py,sha256=Pl7COc9E3tf_yGidkTEmTizNGyZF1F5zuL2TgPTMnfI,1048
75
76
  test/unit/embed/test_voyageai.py,sha256=DviCOJFhe5H4e26-kNyX3JNe8h3qB5Yl0KOe8rQEMrc,981
@@ -94,7 +95,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
94
95
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
95
96
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
96
97
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
97
- unstructured_ingest/__version__.py,sha256=CJalz6YpEm8DAhzCP5dryU5ddzKQaSQOzTObKxfOVHs,43
98
+ unstructured_ingest/__version__.py,sha256=R522TM0FvpKddIRo55tqz-j1ENS8k4uXjk60bKhQ50M,43
98
99
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
99
100
  unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
100
101
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -263,15 +264,15 @@ unstructured_ingest/connector/notion/types/database_properties/url.py,sha256=iXQ
263
264
  unstructured_ingest/connector/notion/types/database_properties/verification.py,sha256=J_DLjY-v2T6xDGMQ7FkI0YMKMA6SG6Y3yYW7qUD1hKA,2334
264
265
  unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
265
266
  unstructured_ingest/embed/azure_openai.py,sha256=4YBOIxv66wVZ5EqNNC4uCDPNJ3VrsLPe5wwagT6zqe0,1001
266
- unstructured_ingest/embed/bedrock.py,sha256=-PRdZsF44vwi6G4G75gdO31AJKfZWClOXkJQAk7rEO8,3096
267
+ unstructured_ingest/embed/bedrock.py,sha256=qb1eo1Uooz2JxhHcqvJDTYGbuwaqyRTD0ZepQzlL9_o,4455
267
268
  unstructured_ingest/embed/huggingface.py,sha256=2cBiQhOhfWHX3hS-eKjocysOkUaRlyRfUj9Kxjrp6cE,1934
268
- unstructured_ingest/embed/interfaces.py,sha256=au4Xp8ciDvo4bidlUbazFW2aC7NZW5-UDLKXBFVzAX4,2025
269
+ unstructured_ingest/embed/interfaces.py,sha256=XsPtb53367KCkH-ItwWQ_EQ-sYWHaekhxkF4PwHCNXc,2210
269
270
  unstructured_ingest/embed/mixedbreadai.py,sha256=OwFWWukvkQaXhjgs6b6N6D4w7sYrtcHNhsHAj-Bocj4,4268
270
- unstructured_ingest/embed/octoai.py,sha256=jHytDfQgup0v1PBcmlMv1nIh9Obg8WGO5qtLmN-Ot5g,1473
271
- unstructured_ingest/embed/openai.py,sha256=JXo4boivNoo2lBzHuS4Z0FZ1zlgUGAPVt0X3HY540ZU,1282
272
- unstructured_ingest/embed/togetherai.py,sha256=BL7NzExSE-laQqrp4ybUgoZ9JG_eop4hk-s2yCO_d5c,1451
271
+ unstructured_ingest/embed/octoai.py,sha256=0LVZlbOMUuxwZV0QHhGWUlneWDX3fCklPRTuc4huze0,3007
272
+ unstructured_ingest/embed/openai.py,sha256=5M2idJ7Ynx_3-FXwm9mTjGnNiww0DSuZmbuvi2YAUqk,2543
273
+ unstructured_ingest/embed/togetherai.py,sha256=2jXYFB9QTDUlSKc_j32bMrwKu7YQA0oF893rGSmlXr8,2374
273
274
  unstructured_ingest/embed/vertexai.py,sha256=X5bGJdXyR5nAFH_ocAVgEowmd60nOBykyfclYo3VfBM,2808
274
- unstructured_ingest/embed/voyageai.py,sha256=bjom9QqWmH1Mv08ewg8ZG7gO3rQPMVS0_ztm2KBAOjI,1821
275
+ unstructured_ingest/embed/voyageai.py,sha256=BfYa-oedkq-56j5_0rDjOLy18b9zC0zagaoPHJry5xA,2958
275
276
  unstructured_ingest/enhanced_dataclass/__init__.py,sha256=gDZOUsv5eo-8jm4Yu7DdDwi101aGbfG7JctTdOYnTOM,151
276
277
  unstructured_ingest/enhanced_dataclass/core.py,sha256=d6aUkDynuKX87cHx9_N5UDUWrvISR4jYRFRTvd_avlI,3038
277
278
  unstructured_ingest/enhanced_dataclass/dataclasses.py,sha256=aZMsoCzAGRb8Rmh3BTSBFtNr6FmFTY93KYGLk3gYJKQ,1949
@@ -360,10 +361,11 @@ unstructured_ingest/utils/string_and_date_utils.py,sha256=kijtPlGAbH376vVjFSo5H_
360
361
  unstructured_ingest/utils/table.py,sha256=aWjcowDVSClNpEAdR6PY3H7khKu4T6T3QqQE6GjmQ_M,3469
361
362
  unstructured_ingest/v2/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
362
363
  unstructured_ingest/v2/constants.py,sha256=pDspTYz-nEojHBqrZNfssGEiujmVa02pIWL63PQP9sU,103
364
+ unstructured_ingest/v2/errors.py,sha256=y1tGvobuhQdcR9vw5APuFigiQSfsQKrAYGDr4biGDdw,207
363
365
  unstructured_ingest/v2/logger.py,sha256=wcln4s5Nyp2fjjJux9iM3d6t9aQFNJ2H1IAZXmIknjI,4323
364
366
  unstructured_ingest/v2/main.py,sha256=WFdLEqEXRy6E9_G-dF20MK2AtgX51Aan1sp_N67U2B8,172
365
367
  unstructured_ingest/v2/otel.py,sha256=2fGj1c7cVcC3J8NwL6MNYhyPEAXiB33DsilvRDkrdLo,4130
366
- unstructured_ingest/v2/unstructured_api.py,sha256=f_6NK0QOVwjAFJvlyvzu0IaXb6QQgRNJleYxB1KvzKE,3856
368
+ unstructured_ingest/v2/unstructured_api.py,sha256=g6AO2Vy0lpy6-ooOvdgfJvIRhearPKArp3ggIdApG8I,4514
367
369
  unstructured_ingest/v2/utils.py,sha256=HHli5rHDBm6flUeQ_ovVDvtOdnzzL4FvNyw6jsHIJfw,2041
368
370
  unstructured_ingest/v2/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
369
371
  unstructured_ingest/v2/cli/cli.py,sha256=qHXIs-PcvMgDZhP1AR9iDMxh8FXBMJCEDksPBfiMULE,648
@@ -406,11 +408,11 @@ unstructured_ingest/v2/processes/embedder.py,sha256=xCBpaL07WnVUOUW8SHktaf1vwBGZ
406
408
  unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
407
409
  unstructured_ingest/v2/processes/partitioner.py,sha256=agpHwB9FR8OZVQqE7zFEb0IcDPCOPA_BZjLzLF71nOY,8194
408
410
  unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
409
- unstructured_ingest/v2/processes/connectors/__init__.py,sha256=CTWLEmaKLTjbqeUQGI0fxJobsqDOc1d2ZKJoXh98Lww,5432
411
+ unstructured_ingest/v2/processes/connectors/__init__.py,sha256=bmogp1sPbRS-RndN0R8V8gY4uaTkpmNJv-035-Y5SGU,5835
410
412
  unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
411
413
  unstructured_ingest/v2/processes/connectors/astradb.py,sha256=xhUMoUdnrfAY1isZGqsV4lZUsnZNpbvgLyQWQbR4hVo,14814
412
414
  unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=ngPDpU0oZ6m5sxIlB6u5ebQpqCS_SJ-_amCC1KQ03EQ,11529
413
- unstructured_ingest/v2/processes/connectors/chroma.py,sha256=G1DQHhhFQCS2RLF0cVvoUH9QO8KkVjIyNZ9nKh__aHw,7220
415
+ unstructured_ingest/v2/processes/connectors/chroma.py,sha256=VHCnM56qNXuHzovJihrNfJnZbWLJShOe8j12PJFrbL0,7219
414
416
  unstructured_ingest/v2/processes/connectors/confluence.py,sha256=-Y1OU_ZXhZQNj5NH3EN01CP8QKKZJaJ9xkXoAlSgnIk,7604
415
417
  unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=i7vuNKsUkN93JRVmg4--MO0ZgbjvhIqt46oYqk9zFSQ,12250
416
418
  unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=SotSXZQ85_6TO906YvFi3yTml8jE9A_zV6nBJ4oTx8A,7075
@@ -418,16 +420,18 @@ unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=ufE65Z8q_tC4oppGg5B
418
420
  unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=5k7pdAzJGXSdyPCzW9vu2OaAjGVTo2JevDyGaXM1Hvk,13370
419
421
  unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=VRDAiou_7oWOIAgQTdOGQWxudzQEDopXM8XkfkQ2j6g,5004
420
422
  unstructured_ingest/v2/processes/connectors/local.py,sha256=ZvWTj6ZYkwnvQMNFsZWoaQyp9zp0WVqAywMaHJ2kcAc,7153
421
- unstructured_ingest/v2/processes/connectors/milvus.py,sha256=I57hyH5nz_p7utmUOkvt_6vCPxNIVQMoukplUgIyYi8,8503
423
+ unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
422
424
  unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNNeAywXVpaIiND4b5JTAFlYjLjw,14273
423
- unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=lRtWe6xWYogT-y_r_o7HWvlFMf_OIPGQq_Z-5v7IOq0,14163
424
- unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=H8qk53YJXAPrPyISze0dybZdDFv5B7dVO3fIr10dVU8,15982
425
+ unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=QTw_Kq1_kHMdqsaBST6yW8vl-SYXVQFlIofDP1W_IuI,14250
426
+ unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=d6gC40YmfqBNXxizAt4MO4OOu5BoCZ7SAe1AbNwTP0E,18322
425
427
  unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
426
428
  unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=cohF7gBj0opSGKXlENSdGfTtyIKMHd1pwu4ydeb7JAY,10605
429
+ unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
427
430
  unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
428
431
  unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=Ndn2Wm7RupfjAtlLxxQwJueeE0V8aGMbNVPuFq9nqdQ,19730
429
432
  unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
430
433
  unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
434
+ unstructured_ingest/v2/processes/connectors/vectara.py,sha256=BlI_4nkpNR99aYxDd9eusm5LQsVB9EI0r-5Kc1D7pgQ,12255
431
435
  unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=jO71UTC7bLA_N12CrLWJzh_yZML5gfT7VohxzCpUGWg,1848
432
436
  unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=kI_ThB5e-DS8-GiQP5TQ8cP3fiGRm-V2AuNlGoSjH6I,6613
433
437
  unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=TA2e_1SIr4VaEI62873eyReCNfgmQ51_2Pko2I04pPM,2747
@@ -451,8 +455,8 @@ unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=l3TRKPEb0AJ7e0VS
451
455
  unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=JsxXKXNI20mdgwR_A6Rnf4u8fsFwLe3AkJmIe_3NEKY,6150
452
456
  unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
453
457
  unstructured_ingest/v2/processes/connectors/kafka/__init__.py,sha256=mQJ9Ex-QCfhz-BB5YWTfbPf7xGLd1i7FpjRr0ukbhNw,754
454
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py,sha256=1SqNdY8Q8JwwB57wk9efxKv_BCeUkxZJ2HJ526wuCMw,3294
455
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py,sha256=a-LWqYeJAK-g32UPgvvDt6W7dJp85N66aR_EKSR66RU,9685
458
+ unstructured_ingest/v2/processes/connectors/kafka/cloud.py,sha256=GdAeQ8Uz-6v1C5byBHtjfevVfbzW3obScBFFLRTb0ps,3441
459
+ unstructured_ingest/v2/processes/connectors/kafka/kafka.py,sha256=UfS41jzV9VxekS6AwWHhURJmJ7RUAw5iiIrj75BWrXQ,10255
456
460
  unstructured_ingest/v2/processes/connectors/kafka/local.py,sha256=lUkmfbTxyQW87CXxbJaijIT6foV09Gi-IG9o08OgiEs,2581
457
461
  unstructured_ingest/v2/processes/connectors/lancedb/__init__.py,sha256=LW37xZrn48JeHluRNulLTreUPdaF-ZU81F7MCUHcCv8,1253
458
462
  unstructured_ingest/v2/processes/connectors/lancedb/aws.py,sha256=eeXWsh8UeVm1Ur53C4MEnpLplfO8U91KYgk--0kk5pE,1413
@@ -464,7 +468,7 @@ unstructured_ingest/v2/processes/connectors/lancedb/local.py,sha256=_7-6iO6B60gA
464
468
  unstructured_ingest/v2/processes/connectors/qdrant/__init__.py,sha256=xM19uYzAuGizVoZIM_hnVZ5AcBN69aOBGpqZcpWPtuE,760
465
469
  unstructured_ingest/v2/processes/connectors/qdrant/cloud.py,sha256=accJ4sNWBVWV-KiVBDBDBYYx5A9CUoikP5NCErRmfik,1624
466
470
  unstructured_ingest/v2/processes/connectors/qdrant/local.py,sha256=cGEyv3Oy6y4BQ4DU8yhJWMpL82QYwBVdPTxxNuV127U,1588
467
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py,sha256=ITRYXKYEFhlagSe-AKKGRvC8jzyWmhQLfHbFb0ax8o8,5438
471
+ unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py,sha256=BHI7HYSdbS05j2vrjyDvLzVG1WfsM8osKeq-lttlybQ,5437
468
472
  unstructured_ingest/v2/processes/connectors/qdrant/server.py,sha256=odvCZWZp8DmRxLXMR7tHhW-c7UQbix1_zpFdfXfCvKI,1613
469
473
  unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=E16CXRBw8fZKTuXIECns5wif_I07oncBHskVxHC4p7w,1448
470
474
  unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
@@ -477,9 +481,9 @@ unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-
477
481
  unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
478
482
  unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
479
483
  unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=X1yv1H_orDQ-J965EMXhR2XaURqe8vovSi9n1fk85B4,10499
480
- unstructured_ingest-0.3.11.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
481
- unstructured_ingest-0.3.11.dist-info/METADATA,sha256=4-py_Sf-ahdzXF0l1evY4aI_s5Vz4oc6Gtenhegc6Vo,7623
482
- unstructured_ingest-0.3.11.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
483
- unstructured_ingest-0.3.11.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
484
- unstructured_ingest-0.3.11.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
485
- unstructured_ingest-0.3.11.dist-info/RECORD,,
484
+ unstructured_ingest-0.3.12.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
485
+ unstructured_ingest-0.3.12.dist-info/METADATA,sha256=nNPregI5d4D8fHqXxTPkKmn7bqmfUX5RB-AcMsgj0J4,7769
486
+ unstructured_ingest-0.3.12.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
487
+ unstructured_ingest-0.3.12.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
488
+ unstructured_ingest-0.3.12.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
489
+ unstructured_ingest-0.3.12.dist-info/RECORD,,
@@ -1,304 +0,0 @@
1
- import json
2
- import os
3
- import tempfile
4
- import time
5
- from pathlib import Path
6
-
7
- import pytest
8
- from confluent_kafka import Consumer, KafkaError, KafkaException, Producer
9
- from confluent_kafka.admin import AdminClient, NewTopic
10
-
11
- from test.integration.connectors.utils.constants import (
12
- DESTINATION_TAG,
13
- SOURCE_TAG,
14
- env_setup_path,
15
- )
16
- from test.integration.connectors.utils.docker_compose import docker_compose_context
17
- from test.integration.connectors.utils.validation.source import (
18
- SourceValidationConfigs,
19
- source_connector_validation,
20
- )
21
- from test.integration.utils import requires_env
22
- from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
23
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
24
- from unstructured_ingest.v2.processes.connectors.kafka.cloud import (
25
- CloudKafkaAccessConfig,
26
- CloudKafkaConnectionConfig,
27
- CloudKafkaDownloader,
28
- CloudKafkaDownloaderConfig,
29
- CloudKafkaIndexer,
30
- CloudKafkaIndexerConfig,
31
- )
32
- from unstructured_ingest.v2.processes.connectors.kafka.local import (
33
- CONNECTOR_TYPE,
34
- LocalKafkaConnectionConfig,
35
- LocalKafkaDownloader,
36
- LocalKafkaDownloaderConfig,
37
- LocalKafkaIndexer,
38
- LocalKafkaIndexerConfig,
39
- LocalKafkaUploader,
40
- LocalKafkaUploaderConfig,
41
- )
42
-
43
- SEED_MESSAGES = 10
44
- TOPIC = "fake-topic"
45
-
46
-
47
- def get_admin_client() -> AdminClient:
48
- conf = {
49
- "bootstrap.servers": "localhost:29092",
50
- }
51
- return AdminClient(conf)
52
-
53
-
54
- @pytest.fixture
55
- def docker_compose_ctx():
56
- with docker_compose_context(docker_compose_path=env_setup_path / "kafka") as ctx:
57
- yield ctx
58
-
59
-
60
- def wait_for_topic(
61
- topic: str,
62
- retries: int = 10,
63
- interval: int = 1,
64
- exists: bool = True,
65
- admin_client=None,
66
- ):
67
- if admin_client is None:
68
- admin_client = get_admin_client()
69
- current_topics = admin_client.list_topics().topics
70
- attempts = 0
71
- while (topic not in current_topics) == exists and attempts < retries:
72
- attempts += 1
73
- print(
74
- "Attempt {}: Waiting for topic {} to {} exist. Current topics: [{}]".format(
75
- attempts, topic, "" if exists else "not", ", ".join(current_topics)
76
- )
77
- )
78
- time.sleep(interval)
79
- current_topics = admin_client.list_topics().topics
80
- if (topic not in current_topics) == exists:
81
- raise TimeoutError(f"Timeout out waiting for topic {topic} to exist")
82
-
83
-
84
- @pytest.fixture
85
- def kafka_seed_topic(docker_compose_ctx) -> str:
86
- conf = {
87
- "bootstrap.servers": "localhost:29092",
88
- }
89
- producer = Producer(conf)
90
- for i in range(SEED_MESSAGES):
91
- message = f"This is some text for message {i}"
92
- producer.produce(topic=TOPIC, value=message)
93
- producer.flush(timeout=10)
94
- print(f"kafka topic {TOPIC} seeded with {SEED_MESSAGES} messages")
95
- wait_for_topic(topic=TOPIC)
96
- return TOPIC
97
-
98
-
99
- @pytest.fixture
100
- def kafka_upload_topic(docker_compose_ctx) -> str:
101
- admin_client = get_admin_client()
102
- admin_client.create_topics([NewTopic(TOPIC, 1, 1)])
103
- return TOPIC
104
-
105
-
106
- @pytest.mark.asyncio
107
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
108
- async def test_kafka_source_local(kafka_seed_topic: str):
109
- connection_config = LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092)
110
- with tempfile.TemporaryDirectory() as tempdir:
111
- tempdir_path = Path(tempdir)
112
- download_config = LocalKafkaDownloaderConfig(download_dir=tempdir_path)
113
- indexer = LocalKafkaIndexer(
114
- connection_config=connection_config,
115
- index_config=LocalKafkaIndexerConfig(topic=kafka_seed_topic, num_messages_to_consume=5),
116
- )
117
- downloader = LocalKafkaDownloader(
118
- connection_config=connection_config, download_config=download_config
119
- )
120
- indexer.precheck()
121
- await source_connector_validation(
122
- indexer=indexer,
123
- downloader=downloader,
124
- configs=SourceValidationConfigs(
125
- test_id="kafka-local", expected_num_files=5, validate_downloaded_files=True
126
- ),
127
- )
128
-
129
-
130
@pytest.fixture
def kafka_seed_topic_cloud(expected_messages: int = 5) -> int:
    """Recreate ``TOPIC`` on the cloud cluster and seed it with text messages.

    Connection settings are read from the ``KAFKA_BOOTSTRAP_SERVER``,
    ``KAFKA_API_KEY`` and ``KAFKA_SECRET`` environment variables. Any existing
    ``TOPIC`` is deleted on a best-effort basis, the topic is created again and
    ``expected_messages`` messages are produced to it.

    Args:
        expected_messages: number of messages to produce.

    Returns:
        The number of messages produced.

    Raises:
        KeyError: if one of the required environment variables is missing.
        AssertionError: if ``TOPIC`` still exists after the cleanup, or if
            messages are still queued after the flush timeout.
    """
    conf = {
        "bootstrap.servers": os.environ["KAFKA_BOOTSTRAP_SERVER"],
        "sasl.username": os.environ["KAFKA_API_KEY"],
        "sasl.password": os.environ["KAFKA_SECRET"],
        "sasl.mechanism": "PLAIN",
        "security.protocol": "SASL_SSL",
    }
    admin_client = AdminClient(conf)
    try:
        res = admin_client.delete_topics([TOPIC], operation_timeout=10)
        for topic, f in res.items():
            f.result()
            print(f"Topic {topic} removed")
            wait_for_topic(TOPIC, 5, 1, False, admin_client)
    except Exception as exc:
        # Cleanup is deliberately best-effort. Report the failure instead of
        # hiding it; the assertion below still catches a topic that could not
        # be removed.
        print(f"Ignoring failure while removing topic {TOPIC}: {exc}")

    cluster_meta = admin_client.list_topics()
    current_topics = [topic for topic in cluster_meta.topics if topic != "__consumer_offsets"]

    assert TOPIC not in current_topics, f"Topic {TOPIC} shouldn't exist"

    # Kafka Cloud allows to use replication_factor=1 only for Dedicated clusters.
    topic_obj = NewTopic(TOPIC, num_partitions=1, replication_factor=3)

    res = admin_client.create_topics([topic_obj], operation_timeout=10, validate_only=False)
    # Block on every creation future so a failed creation raises here.
    for f in res.values():
        f.result()

    producer = Producer(conf)
    for i in range(expected_messages):
        message = f"This is some text for message {i}"
        producer.produce(topic=TOPIC, value=message)
    # flush() returns the number of messages still waiting in the queue; a
    # non-zero value means the topic was only partially seeded.
    undelivered = producer.flush(timeout=10)
    assert undelivered == 0, f"{undelivered} messages were not delivered to topic {TOPIC}"
    return expected_messages
167
-
168
-
169
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
@requires_env("KAFKA_API_KEY", "KAFKA_SECRET", "KAFKA_BOOTSTRAP_SERVER")
async def test_kafka_source_cloud(kafka_seed_topic_cloud: int):
    """
    Index and download the seeded messages from a Confluent Cloud topic.

    For this test to succeed a cluster must exist on Confluent Cloud and the
    API key must have admin privileges. By default, user account keys have it.
    """

    expected_messages = kafka_seed_topic_cloud

    access_config = CloudKafkaAccessConfig(
        kafka_api_key=os.environ["KAFKA_API_KEY"],
        secret=os.environ["KAFKA_SECRET"],
    )
    connection_config = CloudKafkaConnectionConfig(
        bootstrap_server=os.environ["KAFKA_BOOTSTRAP_SERVER"],
        port=9092,
        access_config=access_config,
    )
    index_config = CloudKafkaIndexerConfig(
        topic=TOPIC,
        num_messages_to_consume=expected_messages,
    )
    validation_configs = SourceValidationConfigs(
        test_id="kafka-cloud",
        exclude_fields_extend=["connector_type"],
        expected_num_files=expected_messages,
        validate_downloaded_files=True,
        validate_file_data=True,
    )

    # Downloads land in a throwaway directory that is removed when the block exits.
    with tempfile.TemporaryDirectory() as tempdir:
        download_config = CloudKafkaDownloaderConfig(download_dir=Path(tempdir))
        indexer = CloudKafkaIndexer(
            connection_config=connection_config,
            index_config=index_config,
        )
        downloader = CloudKafkaDownloader(
            connection_config=connection_config,
            download_config=download_config,
        )
        indexer.precheck()
        await source_connector_validation(
            indexer=indexer,
            downloader=downloader,
            configs=validation_configs,
        )
214
-
215
-
216
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
def test_kafka_source_local_precheck_fail_no_cluster():
    """Expect the indexer precheck to raise ``SourceConnectionError``.

    No broker fixture is requested, so presumably nothing is listening on
    ``localhost:29092`` when this runs (TODO confirm).
    """
    indexer = LocalKafkaIndexer(
        connection_config=LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092),
        index_config=LocalKafkaIndexerConfig(topic=TOPIC, num_messages_to_consume=5),
    )
    with pytest.raises(SourceConnectionError):
        indexer.precheck()
225
-
226
-
227
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
def test_kafka_source_local_precheck_fail_no_topic(kafka_seed_topic: str):
    """Expect the indexer precheck to raise ``SourceConnectionError`` for topic ``"topic"``.

    The seeded topic fixture is requested but the indexer is pointed at a
    different topic name.
    """
    indexer = LocalKafkaIndexer(
        connection_config=LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092),
        index_config=LocalKafkaIndexerConfig(topic="topic", num_messages_to_consume=5),
    )
    with pytest.raises(SourceConnectionError):
        indexer.precheck()
236
-
237
-
238
def get_all_messages(topic: str, max_empty_messages: int = 5) -> list[dict]:
    """Read JSON messages from ``topic`` on the local broker.

    Polls with a one second timeout until ``max_empty_messages`` polls in
    total have returned nothing (the counter is never reset), or until a
    partition-EOF event is seen. Each message value is decoded as UTF-8 JSON,
    and the offset is committed synchronously after every message, even when
    decoding fails.

    Args:
        topic: topic to subscribe to.
        max_empty_messages: number of empty polls after which reading stops.

    Returns:
        The decoded messages in the order they were received.

    Raises:
        KafkaException: if a polled message carries an error other than
            partition EOF.
    """
    consumer = Consumer(
        {
            "bootstrap.servers": "localhost:29092",
            "group.id": "default_group_id",
            "enable.auto.commit": "false",
            "auto.offset.reset": "earliest",
        }
    )
    consumer.subscribe([topic])
    collected = []
    try:
        empty_polls = 0
        while empty_polls < max_empty_messages:
            msg = consumer.poll(timeout=1)
            if msg is None:
                empty_polls += 1
                continue
            err = msg.error()
            if err:
                if err.code() == KafkaError._PARTITION_EOF:
                    break
                raise KafkaException(err)
            try:
                collected.append(json.loads(msg.value().decode("utf8")))
            finally:
                consumer.commit(asynchronous=False)
    finally:
        print("closing consumer")
        consumer.close()
    return collected
269
-
270
-
271
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
async def test_kafka_destination_local(upload_file: Path, kafka_upload_topic: str):
    """Upload ``upload_file`` to the local topic and check the message count.

    After the upload, every message is read back from the topic and the count
    is compared with the number of entries in the uploaded JSON file.

    Args:
        upload_file: path to the JSON file handed to the uploader.
        kafka_upload_topic: name of the topic provided by the fixture.
    """
    uploader = LocalKafkaUploader(
        connection_config=LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092),
        # Write to the same topic that is read back below.
        upload_config=LocalKafkaUploaderConfig(topic=kafka_upload_topic, batch_size=10),
    )
    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
        connector_type=CONNECTOR_TYPE,
        identifier="mock file data",
    )
    uploader.precheck()
    if uploader.is_async():
        await uploader.run_async(path=upload_file, file_data=file_data)
    else:
        uploader.run(path=upload_file, file_data=file_data)
    all_messages = get_all_messages(topic=kafka_upload_topic)
    with upload_file.open("r") as upload_fs:
        content_to_upload = json.load(upload_fs)
    assert len(all_messages) == len(content_to_upload), (
        f"expected number of messages ({len(content_to_upload)}) doesn't "
        f"match how many messages read off of kafka topic {kafka_upload_topic}: {len(all_messages)}"
    )
295
-
296
-
297
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
def test_kafka_destination_local_precheck_fail_no_cluster():
    """Expect the uploader precheck to raise ``DestinationConnectionError``.

    No broker fixture is requested, so presumably nothing is listening on
    ``localhost:29092`` when this runs (TODO confirm).
    """
    connection_config = LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092)
    upload_config = LocalKafkaUploaderConfig(topic=TOPIC, batch_size=10)
    uploader = LocalKafkaUploader(
        connection_config=connection_config,
        upload_config=upload_config,
    )
    with pytest.raises(DestinationConnectionError):
        uploader.precheck()