unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,349 @@
1
+ import io
2
+ import json
3
+ import os
4
+ import typing as t
5
+ from dataclasses import dataclass, field
6
+ from datetime import datetime
7
+ from mimetypes import guess_extension
8
+ from pathlib import Path
9
+
10
+ from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
11
+
12
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
13
+ from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
14
+ from unstructured_ingest.interfaces import (
15
+ AccessConfig,
16
+ BaseConnectorConfig,
17
+ BaseSessionHandle,
18
+ BaseSingleIngestDoc,
19
+ BaseSourceConnector,
20
+ ConfigSessionHandleMixin,
21
+ IngestDocCleanupMixin,
22
+ IngestDocSessionHandleMixin,
23
+ SourceConnectorCleanupMixin,
24
+ SourceMetadata,
25
+ )
26
+ from unstructured_ingest.logger import logger
27
+ from unstructured_ingest.utils.dep_check import requires_dependencies
28
+ from unstructured_ingest.utils.string_and_date_utils import json_to_dict
29
+
30
+ if t.TYPE_CHECKING:
31
+ from googleapiclient.discovery import Resource as GoogleAPIResource
32
+ from googleapiclient.http import MediaIoBaseDownload
33
+
34
# Templates used to build local names from a Drive item's id and name
# (files additionally get an extension appended).
FILE_FORMAT = "{id}-{name}{ext}"
DIRECTORY_FORMAT = "{id}-{name}"
36
+
37
+
38
@dataclass
class GoogleDriveSessionHandle(BaseSessionHandle):
    """Session handle wrapping the Google Drive API service object."""

    # Drive service resource; the annotation is a string because the type is
    # only imported under ``t.TYPE_CHECKING``.
    service: "GoogleAPIResource"
41
+
42
+
43
@requires_dependencies(["googleapiclient"], extras="google-drive")
def create_service_account_object(key_path: t.Union[str, dict], id=None):
    """
    Creates a service object for interacting with Google Drive.

    Providing a drive id enforces a key validation process.

    Args:
        key_path: Path to Google Drive service account json file. (or the actual json)
        id: ID of a file on Google Drive. File has to be either publicly accessible or accessible
            to the service account.

    Returns:
        Service account object

    Raises:
        ValueError: if ``key_path`` is neither a dict nor a string, if the Drive API
            answers the validation request with an HTTP error, or if default
            credentials cannot be loaded. The underlying error is kept as ``__cause__``.
    """
    from google.auth import default, exceptions
    from google.oauth2 import service_account
    from googleapiclient.discovery import build
    from googleapiclient.errors import HttpError

    # Service account key can be a dict or a file path(str)
    # But the dict may come in as a string
    key_path = json_to_dict(key_path)

    try:
        if isinstance(key_path, dict):
            creds = service_account.Credentials.from_service_account_info(key_path)
        elif isinstance(key_path, str):
            # NOTE: this sets a process-wide environment variable so that
            # google.auth.default() picks up the key file.
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
            creds, _ = default()
        else:
            raise ValueError(
                "key path not recognized as a dictionary or a file path: "
                f"[{type(key_path)}] {key_path}",
            )
        service = build("drive", "v3", credentials=creds)

        if id:
            # Issue one listing request so that invalid credentials or an
            # inaccessible id fail here instead of later.
            service.files().list(
                spaces="drive",
                fields="files(id)",
                pageToken=None,
                corpora="user",
                q=f"'{id}' in parents",
            ).execute()

    except HttpError as exc:
        raise ValueError(f"{exc.reason}") from exc
    except exceptions.DefaultCredentialsError as exc:
        raise ValueError("The provided API key is invalid.") from exc

    return service
95
+
96
+
97
@dataclass
class GoogleDriveAccessConfig(AccessConfig):
    """Access configuration holding the Google Drive service account key."""

    # Either a string or a dict; declared through ``enhanced_field`` with
    # ``sensitive=True``.
    service_account_key: t.Union[str, dict] = enhanced_field(sensitive=True)
100
+
101
+
102
@dataclass
class SimpleGoogleDriveConfig(ConfigSessionHandleMixin, BaseConnectorConfig):
    """Connector config where drive_id is the id of the document to process or
    the folder to process all documents from."""

    # Google Drive Specific Options
    drive_id: str
    access_config: GoogleDriveAccessConfig
    extension: t.Optional[str] = None
    recursive: bool = False

    def create_session_handle(self) -> GoogleDriveSessionHandle:
        """Build a Drive service from the configured key and wrap it in a session handle."""
        key = self.access_config.service_account_key
        return GoogleDriveSessionHandle(service=create_service_account_object(key))
118
+
119
+
120
@dataclass
class GoogleDriveIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, BaseSingleIngestDoc):
    """A single Google Drive file to download and write results for.

    ``meta`` holds the Drive listing entry for the file. This class reads its
    ``id``, ``name``, ``mimeType``, ``download_dir``, ``download_filepath`` and
    ``output_filepath`` keys.
    """

    connector_config: SimpleGoogleDriveConfig
    meta: t.Dict[str, str] = field(default_factory=dict)
    registry_name: str = "google_drive"

    @property
    def filename(self):
        """Resolved local path the file is downloaded to."""
        return Path(self.meta.get("download_filepath")).resolve()  # type: ignore

    @property
    def _output_filename(self):
        """Resolved path of the JSON result: the output filepath plus ``.json``."""
        return Path(f"{self.meta.get('output_filepath')}.json").resolve()

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Identify this record by the configured drive id and the file id."""
        return {
            "drive_id": self.connector_config.drive_id,
            "file_id": self.meta["id"],
        }

    @staticmethod
    def _to_iso(timestamp: t.Optional[str]) -> t.Optional[str]:
        """Convert a ``%Y-%m-%dT%H:%M:%S.%fZ`` timestamp to ISO format, or None if empty.

        The trailing ``Z`` is matched as a literal, so the result is a naive
        (timezone-less) ISO string.
        """
        if not timestamp:
            return None
        return datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ").isoformat()

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    def update_source_metadata(self):
        """Fetch the file's metadata from Drive and store it in ``self.source_metadata``.

        A 404 response is logged and recorded as ``exists=False``; any other
        ``HttpError`` is re-raised.
        """
        from googleapiclient.errors import HttpError

        try:
            file_obj = (
                self.session_handle.service.files()
                .get(
                    fileId=self.meta["id"],
                    fields="id, createdTime, modifiedTime, version, webContentLink",
                )
                .execute()
            )
        except HttpError as e:
            if e.status_code == 404:
                logger.error(f"File {self.meta['name']} not found")
                # The file is gone, so it must not be reported as existing.
                self.source_metadata = SourceMetadata(
                    exists=False,
                )
                return
            raise

        self.source_metadata = SourceMetadata(
            date_created=self._to_iso(file_obj.get("createdTime", "")),
            date_modified=self._to_iso(file_obj.get("modifiedTime", "")),
            version=file_obj.get("version", ""),
            source_url=file_obj.get("webContentLink", ""),
            exists=True,
        )

    @SourceConnectionNetworkError.wrap
    def _run_downloader(self, downloader: "MediaIoBaseDownload") -> bool:
        """Pull chunks from ``downloader`` until it reports completion."""
        downloaded = False
        while downloaded is False:
            _, downloaded = downloader.next_chunk()
        return downloaded

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Download the file (exporting Google-native documents) to ``self.filename``.

        Files whose Google-native mime type has no entry in
        ``GOOGLE_DRIVE_EXPORT_TYPES`` are logged and skipped.
        """
        from googleapiclient.http import MediaIoBaseDownload

        if self.meta.get("mimeType", "").startswith("application/vnd.google-apps"):
            export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(
                self.meta.get("mimeType"),  # type: ignore
            )
            if not export_mime:
                logger.info(
                    f"File not supported. Name: {self.meta.get('name')} "
                    f"ID: {self.meta.get('id')} "
                    f"MimeType: {self.meta.get('mimeType')}",
                )
                return

            request = self.session_handle.service.files().export_media(
                fileId=self.meta.get("id"),
                mimeType=export_mime,
            )
        else:
            request = self.session_handle.service.files().get_media(fileId=self.meta.get("id"))

        buffer = io.BytesIO()
        downloader = MediaIoBaseDownload(buffer, request)
        self.update_source_metadata()
        downloaded = self._run_downloader(downloader=downloader)

        if not downloaded:
            logger.error(f"Error while downloading and saving file: {self.filename}.")
            return

        # BytesIO and Path objects are always truthy, so the only real
        # condition left here is whether the directory already exists.
        download_dir = Path(self.meta["download_dir"])
        if not download_dir.is_dir():
            logger.debug(f"Creating directory: {self.meta.get('download_dir')}")
            download_dir.mkdir(parents=True, exist_ok=True)

        with open(self.filename, "wb") as handler:
            handler.write(buffer.getbuffer())
        logger.debug(f"File downloaded: {self.filename}.")

    def write_result(self):
        """Write the structured json result for this doc. result must be json serializable."""
        if self.read_config.download_only:
            return
        self._output_filename.parent.mkdir(parents=True, exist_ok=True)
        # ensure_ascii=False can emit non-ASCII characters, so the encoding is
        # pinned instead of depending on the platform default.
        with open(self._output_filename, "w", encoding="utf-8") as output_f:
            output_f.write(json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2))
        logger.info(f"Wrote {self._output_filename}")
246
+
247
+
248
@dataclass
class GoogleDriveSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Objects of this class support fetching documents from Google Drive"""

    connector_config: SimpleGoogleDriveConfig

    def _list_objects(self, drive_id, recursive=False):
        """List the files under ``drive_id`` as Drive metadata dicts.

        Each returned dict is the Drive listing entry (``id``, ``name``,
        ``mimeType``) with ``download_dir``, ``download_filepath``,
        ``output_dir`` and ``output_filepath`` added.

        Args:
            drive_id: id whose children are listed.
            recursive: when True, sub-folders are traversed as well.

        Returns:
            List of metadata dicts, one per file kept.
        """
        files = []
        service = self.connector_config.create_session_handle().service

        def traverse(drive_id, download_dir, output_dir, recursive=False):
            page_token = None
            while True:
                response = (
                    service.files()
                    .list(
                        spaces="drive",
                        fields="nextPageToken, files(id, name, mimeType)",
                        pageToken=page_token,
                        corpora="user",
                        q=f"'{drive_id}' in parents",
                    )
                    .execute()
                )

                for meta in response.get("files", []):
                    mime_type = meta.get("mimeType", "")

                    if mime_type == "application/vnd.google-apps.folder":
                        if recursive:
                            dir_ = DIRECTORY_FORMAT.format(name=meta.get("name"), id=meta.get("id"))
                            download_sub_dir = (download_dir / dir_).resolve()
                            output_sub_dir = (output_dir / dir_).resolve()
                            traverse(meta.get("id"), download_sub_dir, output_sub_dir, True)
                        continue

                    # Only guess an extension when the name does not carry one.
                    ext = ""
                    if not Path(meta.get("name")).suffixes:
                        ext = guess_extension(mime_type) or ""

                    if mime_type.startswith("application/vnd.google-apps"):
                        export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(mime_type)
                        if not export_mime:
                            logger.info(
                                f"File {meta.get('name')} has an "
                                f"unsupported MimeType {meta.get('mimeType')}",
                            )
                            continue

                        if not ext:
                            ext = guess_extension(export_mime) or ""

                    # TODO (Habeeb): Consider filtering at the query level.
                    # Compare against the suffix the saved file will end with.
                    # ``ext`` alone is empty for names that already have a
                    # suffix, which would exclude every such file.
                    wanted_ext = self.connector_config.extension
                    effective_ext = ext or Path(meta.get("name")).suffix
                    if wanted_ext and wanted_ext != effective_ext:
                        logger.debug(
                            f"File {meta.get('name')} does not match "
                            f"the file type {self.connector_config.extension}",
                        )
                        continue

                    name = FILE_FORMAT.format(name=meta.get("name"), id=meta.get("id"), ext=ext)
                    meta["download_dir"] = str(download_dir)
                    meta["download_filepath"] = (download_dir / name).resolve().as_posix()
                    meta["output_dir"] = str(output_dir)
                    meta["output_filepath"] = (output_dir / name).resolve().as_posix()
                    files.append(meta)

                page_token = response.get("nextPageToken", None)
                if page_token is None:
                    break

        traverse(
            drive_id,
            Path(self.read_config.download_dir),
            Path(self.processor_config.output_dir),
            recursive,
        )
        return files

    def initialize(self):
        """Nothing to set up for this connector."""
        pass

    def check_connection(self):
        """Validate the configuration by creating a session handle.

        Raises:
            SourceConnectionError: if the session handle cannot be created; the
                original exception is kept as ``__cause__``.
        """
        try:
            self.connector_config.create_session_handle().service
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def get_ingest_docs(self):
        """Return one ``GoogleDriveIngestDoc`` per file found under the configured drive id."""
        files = self._list_objects(self.connector_config.drive_id, self.connector_config.recursive)
        return [
            GoogleDriveIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                meta=file,
            )
            for file in files
        ]
@@ -0,0 +1,278 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+ from enum import Enum
4
+ from functools import reduce
5
+ from pathlib import Path
6
+
7
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
8
+ from unstructured_ingest.error import SourceConnectionError
9
+ from unstructured_ingest.interfaces import (
10
+ AccessConfig,
11
+ BaseConnectorConfig,
12
+ BaseSessionHandle,
13
+ BaseSingleIngestDoc,
14
+ BaseSourceConnector,
15
+ ConfigSessionHandleMixin,
16
+ IngestDocCleanupMixin,
17
+ IngestDocSessionHandleMixin,
18
+ SourceConnectorCleanupMixin,
19
+ SourceMetadata,
20
+ )
21
+ from unstructured_ingest.logger import logger
22
+ from unstructured_ingest.utils.dep_check import requires_dependencies
23
+
24
if t.TYPE_CHECKING:
    # Needed only for the "HubSpot" string annotation on the session handle;
    # the runtime import happens inside create_session_handle().
    from hubspot import HubSpot

# NOTE(review): not referenced by any code visible in this module - confirm
# whether external callers still rely on it.
CONTENT_TAG = "content"
28
+
29
+
30
class HubSpotObjectTypes(Enum):
    """Object type names this connector knows how to list and fetch.

    Each value is the lowercase name used both as a key when selecting
    object types and as a path segment when resolving client API methods.
    """

    # Resolved under ``crm.objects.<type>`` by the ingest doc.
    CALLS = "calls"
    COMMUNICATIONS = "communications"
    EMAILS = "emails"
    NOTES = "notes"
    # Resolved directly under ``crm.<type>`` by the ingest doc.
    PRODUCTS = "products"
    TICKETS = "tickets"
37
+
38
+
39
@dataclass
class HubSpotSessionHandle(BaseSessionHandle):
    """Session handle that carries a HubSpot client."""

    # Client instance created in SimpleHubSpotConfig.create_session_handle().
    service: "HubSpot"
42
+
43
+
44
@dataclass
class HubSpotAccessConfig(AccessConfig):
    """Credentials used to build the HubSpot client."""

    # Passed as ``access_token`` when the client is constructed. Declared with
    # repr=False / sensitive=True - presumably to keep it out of reprs and
    # serialized output; confirm against enhanced_field.
    api_token: str = enhanced_field(repr=False, sensitive=True)
47
+
48
+
49
@dataclass
class SimpleHubSpotConfig(ConfigSessionHandleMixin, BaseConnectorConfig):
    """Connector configuration for the HubSpot source."""

    access_config: HubSpotAccessConfig
    # NOTE(review): ``params`` and ``properties`` are not read by any code
    # visible in this module - confirm whether they are still needed.
    params: t.Optional[str] = None
    properties: t.Optional[dict] = None
    # Object type names to ingest; None means every supported type.
    object_types: t.Optional[t.List[str]] = None
    # Extra property names to fetch, keyed by object type name.
    custom_properties: t.Optional[t.Dict[str, t.List[str]]] = None

    @requires_dependencies(["hubspot"], extras="hubspot")
    def create_session_handle(self) -> HubSpotSessionHandle:
        """Create a HubSpot client from the API token and wrap it in a handle."""
        # Imported lazily so the module can be loaded without the extra.
        from hubspot import HubSpot

        client = HubSpot(access_token=self.access_config.api_token)
        handle = HubSpotSessionHandle(service=client)
        return handle
63
+
64
+
65
@dataclass
class HubSpotIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, BaseSingleIngestDoc):
    """A single HubSpot object that is fetched by ID and written to a text file.

    The text written to disk is the newline-joined values of the object's
    ``content_properties`` (plus any custom properties configured for its
    object type).
    """

    connector_config: SimpleHubSpotConfig
    object_id: str
    object_type: str
    content_properties: t.List[str]
    registry_name: str = "hubspot"

    def __post_init__(self):
        self._add_custom_properties()

    @property
    def filename(self):
        """Absolute path ``<download_dir>/<object_type>/<object_id>.txt``."""
        return (
            Path(self.read_config.download_dir)
            / f"{self.object_type}/{self.object_id}.txt"  # type: ignore
        ).resolve()

    @property
    def _output_filename(self):
        """Absolute path ``<output_dir>/<object_type>/<object_id>.json``."""
        return (
            Path(self.processor_config.output_dir)
            / f"{self.object_type}/{self.object_id}.json"  # type: ignore
        ).resolve()

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Mapping of ``"<registry_name>_id"`` to this object's ID."""
        return {
            f"{self.registry_name}_id": self.object_id,
        }

    @property
    def version(self) -> t.Optional[str]:
        """Always None: no version is reported for HubSpot objects."""
        return None

    @property
    def source_url(self) -> t.Optional[str]:
        """Always None: no source URL is reported for HubSpot objects."""
        return None

    def _add_custom_properties(self):
        """Extend ``content_properties`` with the custom ones for this type.

        A new list is assigned instead of extending in place: the incoming
        list may be shared by every doc created from the same listing call,
        and an in-place ``+=`` would keep growing that shared list once per
        doc. Names already present are not added again, which also keeps the
        method idempotent if ``__post_init__`` runs more than once on the
        same data.
        """
        custom_properties = self.connector_config.custom_properties
        if custom_properties is None:
            return
        cprops = custom_properties.get(self.object_type)
        if cprops is None:
            return
        merged = list(self.content_properties)
        for prop in cprops:
            if prop not in merged:
                merged.append(prop)
        self.content_properties = merged

    def _join_object_properties(self, obj) -> str:
        """Join the non-None content property values of ``obj`` with newlines."""
        values = (obj.properties.get(cprop) for cprop in self.content_properties)
        return "\n".join(value for value in values if value is not None)

    def _resolve_getter(self):
        """Return the client's ``get_by_id`` method for this object type.

        Raises:
            ValueError: if ``object_type`` is not one of the values of
                ``HubSpotObjectTypes``.
        """
        if self.object_type in (
            HubSpotObjectTypes.CALLS.value,
            HubSpotObjectTypes.COMMUNICATIONS.value,
            HubSpotObjectTypes.EMAILS.value,
            HubSpotObjectTypes.NOTES.value,
        ):
            method_path = f"crm.objects.{self.object_type}.basic_api.get_by_id"
        elif self.object_type in (
            HubSpotObjectTypes.PRODUCTS.value,
            HubSpotObjectTypes.TICKETS.value,
        ):
            method_path = f"crm.{self.object_type}.basic_api.get_by_id"
        else:
            # Previously this fell through to getattr(service, ""), which
            # failed with an AttributeError that did not name the real problem.
            raise ValueError(f"Unsupported HubSpot object type: {self.object_type!r}")

        # Walk the dotted path attribute by attribute, starting at the client.
        return reduce(getattr, method_path.split("."), self.session_handle.service)

    @requires_dependencies(["hubspot"], extras="hubspot")
    def _fetch_obj(self, check_only=False):
        """Fetch this object by ID.

        Args:
            check_only: when True, request no properties (an empty list)
                instead of ``content_properties``.

        Returns:
            The client's response, or None when ``NotFoundException`` is
            raised (the exception is logged).
        """
        # NOTE(review): this is the exception class from hubspot.crm.objects;
        # confirm that the products/tickets APIs raise the same class.
        from hubspot.crm.objects.exceptions import NotFoundException

        get_by_id_method = self._resolve_getter()
        try:
            response = get_by_id_method(
                self.object_id,
                properties=([] if check_only else self.content_properties),
            )
        except NotFoundException as e:
            logger.error(e)
            return None
        return response

    def update_source_metadata(self, **kwargs) -> None:
        """Populate ``source_metadata`` from the HubSpot object.

        Args:
            **kwargs: may contain ``object``, an already-fetched object. It is
                fetched here only when that key is absent, so callers that
                supply it do not trigger a second request.
        """
        if "object" in kwargs:
            obj = kwargs["object"]
        else:
            obj = self._fetch_obj(check_only=True)  # type: ignore
        if obj is None:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return
        self.source_metadata = SourceMetadata(
            date_created=obj.created_at.isoformat(),
            date_modified=obj.updated_at.isoformat(),
            exists=True,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Fetch the object and write its joined properties to ``filename``.

        Raises:
            ValueError: if the object could not be retrieved.
        """
        obj = self._fetch_obj()
        if obj is None:
            # One message string: passing two arguments made the exception
            # text render as a tuple.
            raise ValueError(
                f"Failed to retrieve object {self.registry_name} "
                f"with ID {self.object_id}",
            )
        self.update_source_metadata(object=obj)
        output = self._join_object_properties(obj)
        self.filename.parent.mkdir(parents=True, exist_ok=True)
        with open(self.filename, "w", encoding="utf8") as f:
            f.write(output)
180
+
181
+
182
@dataclass
class HubSpotSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that lists HubSpot objects and wraps them as ingest docs."""

    connector_config: SimpleHubSpotConfig

    def initialize(self):
        """Create the HubSpot client used by the listing methods."""
        self.hubspot = self.connector_config.create_session_handle().service

    def check_connection(self):
        """Verify that a session handle can be created.

        Returns:
            The client held by the newly created session handle.

        Raises:
            SourceConnectionError: if creating the handle fails; the original
                exception is logged and attached as the cause.
        """
        try:
            return self.connector_config.create_session_handle().service
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["hubspot"], extras="hubspot")
    def _list_objects(self, get_page_method, object_type: str, content_properties: t.List[str]):
        """List objects of one type and wrap each result in an ingest doc.

        Args:
            get_page_method: zero-argument callable returning an object with
                a ``results`` attribute.
            object_type: object type name stored on each doc.
            content_properties: property names whose values form the doc text.

        Returns:
            One ``HubSpotIngestDoc`` per listed result, or an empty list when
            the listing call raises (the failure is logged, not propagated).
        """
        # NOTE(review): the page method is called once with no arguments and
        # only that response's results are used - confirm whether further
        # pages need to be requested.
        try:
            objects = get_page_method()
        except Exception as e:
            logger.error(e)
            logger.error(
                f"Failed to retrieve {object_type}, omitting processing...",
            )
            return []
        return [
            HubSpotIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                object_id=obj.id,
                object_type=object_type,
                # Each doc gets its own copy so extending the property list on
                # one doc cannot leak into the others.
                content_properties=list(content_properties),
            )
            for obj in objects.results
        ]

    def _get_calls(self) -> t.List[HubSpotIngestDoc]:
        """List calls; doc text comes from the call title and body."""
        return self._list_objects(
            self.hubspot.crm.objects.calls.basic_api.get_page,
            HubSpotObjectTypes.CALLS.value,
            ["hs_call_title", "hs_call_body"],
        )

    def _get_communications(self) -> t.List[HubSpotIngestDoc]:
        """List communications; doc text comes from the communication body."""
        return self._list_objects(
            self.hubspot.crm.objects.communications.basic_api.get_page,
            HubSpotObjectTypes.COMMUNICATIONS.value,
            ["hs_communication_body"],
        )

    def _get_emails(self) -> t.List[HubSpotIngestDoc]:
        """List emails; doc text comes from the email subject and text."""
        return self._list_objects(
            self.hubspot.crm.objects.emails.basic_api.get_page,
            HubSpotObjectTypes.EMAILS.value,
            ["hs_email_subject", "hs_email_text"],
        )

    def _get_notes(self) -> t.List[HubSpotIngestDoc]:
        """List notes; doc text comes from the note body."""
        return self._list_objects(
            self.hubspot.crm.objects.notes.basic_api.get_page,
            HubSpotObjectTypes.NOTES.value,
            ["hs_note_body"],
        )

    def _get_products(self) -> t.List[HubSpotIngestDoc]:
        """List products; doc text comes from the description."""
        return self._list_objects(
            self.hubspot.crm.products.basic_api.get_page,
            HubSpotObjectTypes.PRODUCTS.value,
            ["description"],
        )

    def _get_tickets(self) -> t.List[HubSpotIngestDoc]:
        """List tickets; doc text comes from the subject and content."""
        return self._list_objects(
            self.hubspot.crm.tickets.basic_api.get_page,
            HubSpotObjectTypes.TICKETS.value,
            ["subject", "content"],
        )

    def get_ingest_docs(self):
        """Collect ingest docs for every requested object type.

        When ``connector_config.object_types`` is None, all supported types
        are listed; otherwise only the named ones, in the configured order.

        Raises:
            ValueError: if a configured object type is not supported.
        """
        obj_method_resolver = {
            HubSpotObjectTypes.CALLS.value: self._get_calls,
            HubSpotObjectTypes.COMMUNICATIONS.value: self._get_communications,
            HubSpotObjectTypes.EMAILS.value: self._get_emails,
            HubSpotObjectTypes.NOTES.value: self._get_notes,
            HubSpotObjectTypes.PRODUCTS.value: self._get_products,
            HubSpotObjectTypes.TICKETS.value: self._get_tickets,
        }

        if self.connector_config.object_types is not None:
            # dict.fromkeys keeps the configured order and drops repeats.
            requested = list(dict.fromkeys(self.connector_config.object_types))
            unsupported = [name for name in requested if name not in obj_method_resolver]
            if unsupported:
                # Fail with a clear message instead of calling None later on.
                raise ValueError(
                    f"Unsupported HubSpot object types: {unsupported}. "
                    f"Supported types: {list(obj_method_resolver)}",
                )
            obj_method_resolver = {
                obj_name: obj_method_resolver[obj_name] for obj_name in requested
            }

        ingest_docs: t.List[HubSpotIngestDoc] = []
        for obj_name, obj_method in obj_method_resolver.items():
            logger.info(f"Retrieving - {obj_name}")
            ingest_docs.extend(obj_method())

        return ingest_docs