unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic (further details are linked from the original registry page).

Files changed (125)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +57 -13
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -23
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/connector.py +5 -7
  34. unstructured_ingest/v2/interfaces/downloader.py +8 -5
  35. unstructured_ingest/v2/interfaces/file_data.py +8 -2
  36. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  37. unstructured_ingest/v2/interfaces/processor.py +10 -10
  38. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  39. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  40. unstructured_ingest/v2/pipeline/pipeline.py +9 -6
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  42. unstructured_ingest/v2/pipeline/steps/download.py +13 -11
  43. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
  45. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  46. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  47. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  49. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  50. unstructured_ingest/v2/processes/__init__.py +18 -0
  51. unstructured_ingest/v2/processes/chunker.py +74 -28
  52. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  53. unstructured_ingest/v2/processes/connectors/__init__.py +18 -3
  54. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +46 -39
  55. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
  56. unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
  57. unstructured_ingest/v2/processes/connectors/couchbase.py +333 -0
  58. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
  59. unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
  60. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
  61. unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
  62. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
  63. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
  64. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
  66. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
  67. unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
  68. unstructured_ingest/v2/processes/connectors/kdbai.py +170 -0
  69. unstructured_ingest/v2/processes/connectors/local.py +27 -16
  70. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
  72. unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
  73. unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +22 -21
  75. unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
  76. unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
  77. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  78. unstructured_ingest/v2/processes/connectors/sql.py +29 -31
  79. unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
  80. unstructured_ingest/v2/processes/embedder.py +106 -47
  81. unstructured_ingest/v2/processes/filter.py +11 -5
  82. unstructured_ingest/v2/processes/partitioner.py +79 -33
  83. unstructured_ingest/v2/processes/uncompress.py +3 -3
  84. unstructured_ingest/v2/utils.py +45 -0
  85. unstructured_ingest-0.0.5.dist-info/LICENSE.md +201 -0
  86. unstructured_ingest-0.0.5.dist-info/METADATA +574 -0
  87. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/RECORD +91 -116
  88. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/WHEEL +1 -1
  89. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  90. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  91. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  92. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  93. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  94. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  95. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  96. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  97. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
  99. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  100. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  101. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  102. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  103. unstructured_ingest/v2/cli/cmds/local.py +0 -52
  104. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  105. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  106. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  107. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  108. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  109. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  110. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  111. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  112. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  113. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  114. unstructured_ingest/v2/cli/configs/__init__.py +0 -13
  115. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  116. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  117. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/top_level.txt +0 -0
@@ -1,81 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
6
- from unstructured_ingest.v2.cli.cmds.fsspec.fsspec import (
7
- FsspecCliDownloadConfig,
8
- FsspecCliIndexerConfig,
9
- FsspecCliUploaderConfig,
10
- )
11
- from unstructured_ingest.v2.cli.interfaces import CliConfig
12
- from unstructured_ingest.v2.cli.utils import FileOrJson
13
- from unstructured_ingest.v2.processes.connectors.fsspec.gcs import (
14
- CONNECTOR_TYPE,
15
- )
16
-
17
-
18
- @dataclass
19
- class GcsCliDownloadConfig(FsspecCliDownloadConfig):
20
- pass
21
-
22
-
23
- @dataclass
24
- class GcsCliIndexerConfig(FsspecCliIndexerConfig):
25
- pass
26
-
27
-
28
- @dataclass
29
- class GcsCliConnectionConfig(CliConfig):
30
- @staticmethod
31
- def get_cli_options() -> list[click.Option]:
32
- help_string = """
33
- Options:
34
- - ``None``, GCSFS will attempt to guess your credentials in the
35
- following order: gcloud CLI default, gcsfs cached token, google compute
36
- metadata service, anonymous.
37
- - ``'google_default'``, your default gcloud credentials will be used,
38
- which are typically established by doing ``gcloud login`` in a terminal.
39
- - ``'cache'``, credentials from previously successful gcsfs
40
- authentication will be used (use this after "browser" auth succeeded)
41
- - ``'anon'``, no authentication is performed, and you can only
42
- access data which is accessible to allUsers (in this case, the project and
43
- access level parameters are meaningless)
44
- - ``'browser'``, you get an access code with which you can
45
- authenticate via a specially provided URL
46
- - if ``'cloud'``, we assume we are running within google compute
47
- or google container engine, and query the internal metadata directly for
48
- a token.
49
- - you may supply a token generated by the
50
- [gcloud](https://cloud.google.com/sdk/docs/)
51
- utility; this is either a python dictionary or the name of a file
52
- containing the JSON returned by logging in with the gcloud CLI tool.
53
- """
54
- options = [
55
- click.Option(
56
- ["--service-account-key"],
57
- default=None,
58
- type=FileOrJson(allow_raw_str=True),
59
- help=help_string,
60
- ),
61
- ]
62
- return options
63
-
64
-
65
- @dataclass
66
- class GcsUploaderConfig(FsspecCliUploaderConfig):
67
- pass
68
-
69
-
70
- gcs_src_cmd = SrcCmd(
71
- cmd_name=CONNECTOR_TYPE,
72
- indexer_config=GcsCliIndexerConfig,
73
- connection_config=GcsCliConnectionConfig,
74
- downloader_config=GcsCliDownloadConfig,
75
- )
76
-
77
- gcs_dest_cmd = DestCmd(
78
- cmd_name=CONNECTOR_TYPE,
79
- connection_config=GcsCliConnectionConfig,
80
- uploader_config=GcsUploaderConfig,
81
- )
@@ -1,84 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
6
- from unstructured_ingest.v2.cli.cmds.fsspec.fsspec import (
7
- FsspecCliDownloadConfig,
8
- FsspecCliIndexerConfig,
9
- FsspecCliUploaderConfig,
10
- )
11
- from unstructured_ingest.v2.cli.interfaces import CliConfig
12
- from unstructured_ingest.v2.processes.connectors.fsspec.s3 import (
13
- CONNECTOR_TYPE,
14
- )
15
-
16
-
17
- @dataclass
18
- class S3CliDownloadConfig(FsspecCliDownloadConfig):
19
- pass
20
-
21
-
22
- @dataclass
23
- class S3CliIndexerConfig(FsspecCliIndexerConfig):
24
- pass
25
-
26
-
27
- @dataclass
28
- class S3CliConnectionConfig(CliConfig):
29
- @staticmethod
30
- def get_cli_options() -> list[click.Option]:
31
- options = [
32
- click.Option(
33
- ["--anonymous"],
34
- is_flag=True,
35
- default=False,
36
- help="Connect to s3 without local AWS credentials.",
37
- ),
38
- click.Option(
39
- ["--endpoint-url"],
40
- type=str,
41
- default=None,
42
- help="Use this endpoint_url, if specified. Needed for "
43
- "connecting to non-AWS S3 buckets.",
44
- ),
45
- click.Option(
46
- ["--key"],
47
- type=str,
48
- default=None,
49
- help="If not anonymous, use this access key ID, if specified. Takes precedence "
50
- "over `aws_access_key_id` in client_kwargs.",
51
- ),
52
- click.Option(
53
- ["--secret"],
54
- type=str,
55
- default=None,
56
- help="If not anonymous, use this secret access key, if specified.",
57
- ),
58
- click.Option(
59
- ["--token"],
60
- type=str,
61
- default=None,
62
- help="If not anonymous, use this security token, if specified.",
63
- ),
64
- ]
65
- return options
66
-
67
-
68
- @dataclass
69
- class S3UploaderConfig(FsspecCliUploaderConfig):
70
- pass
71
-
72
-
73
- s3_src_cmd = SrcCmd(
74
- cmd_name=CONNECTOR_TYPE,
75
- indexer_config=S3CliIndexerConfig,
76
- connection_config=S3CliConnectionConfig,
77
- downloader_config=S3CliDownloadConfig,
78
- )
79
-
80
- s3_dest_cmd = DestCmd(
81
- cmd_name=CONNECTOR_TYPE,
82
- connection_config=S3CliConnectionConfig,
83
- uploader_config=S3UploaderConfig,
84
- )
@@ -1,80 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
6
- from unstructured_ingest.v2.cli.cmds.fsspec.fsspec import (
7
- FsspecCliDownloadConfig,
8
- FsspecCliIndexerConfig,
9
- FsspecCliUploaderConfig,
10
- )
11
- from unstructured_ingest.v2.cli.interfaces import CliConfig
12
- from unstructured_ingest.v2.processes.connectors.fsspec.sftp import (
13
- CONNECTOR_TYPE,
14
- )
15
-
16
-
17
- @dataclass
18
- class SftpCliDownloadConfig(FsspecCliDownloadConfig):
19
- pass
20
-
21
-
22
- @dataclass
23
- class SftpCliIndexerConfig(FsspecCliIndexerConfig):
24
- pass
25
-
26
-
27
- @dataclass
28
- class SftpCliConnectionConfig(CliConfig):
29
- @staticmethod
30
- def get_cli_options() -> list[click.Option]:
31
- options = [
32
- click.Option(
33
- ["--username"],
34
- required=True,
35
- type=str,
36
- help="Username for sftp connection",
37
- ),
38
- click.Option(
39
- ["--password"],
40
- required=True,
41
- type=str,
42
- help="Password for sftp connection",
43
- ),
44
- click.Option(
45
- ["--look-for-keys"],
46
- required=False,
47
- default=False,
48
- is_flag=True,
49
- type=bool,
50
- help="Whether to search for private key files in ~/.ssh/",
51
- ),
52
- click.Option(
53
- ["--allow-agent"],
54
- required=False,
55
- default=False,
56
- is_flag=True,
57
- type=bool,
58
- help="Whether to connect to the SSH agent.",
59
- ),
60
- ]
61
- return options
62
-
63
-
64
- @dataclass
65
- class SftpUploaderConfig(FsspecCliUploaderConfig):
66
- pass
67
-
68
-
69
- sftp_src_cmd = SrcCmd(
70
- cmd_name=CONNECTOR_TYPE,
71
- indexer_config=SftpCliIndexerConfig,
72
- connection_config=SftpCliConnectionConfig,
73
- downloader_config=SftpCliDownloadConfig,
74
- )
75
-
76
- sftp_dest_cmd = DestCmd(
77
- cmd_name=CONNECTOR_TYPE,
78
- connection_config=SftpCliConnectionConfig,
79
- uploader_config=SftpUploaderConfig,
80
- )
@@ -1,74 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import SrcCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.cli.utils import DelimitedString, FileOrJson
8
- from unstructured_ingest.v2.processes.connectors.google_drive import CONNECTOR_TYPE
9
-
10
-
11
- @dataclass
12
- class GoogleDriveCliConnectionConfig(CliConfig):
13
- @staticmethod
14
- def get_cli_options() -> list[click.Option]:
15
- options = [
16
- click.Option(
17
- ["--drive-id"],
18
- required=True,
19
- type=str,
20
- help="Google Drive File or Folder ID.",
21
- ),
22
- click.Option(
23
- ["--service-account-key"],
24
- required=True,
25
- type=FileOrJson(),
26
- help="Either the file path of the credentials file to use or a json string of "
27
- "those values to use for authentication",
28
- ),
29
- ]
30
- return options
31
-
32
-
33
- @dataclass
34
- class GoogleDriveCliIndexerConfig(CliConfig):
35
- @staticmethod
36
- def get_cli_options() -> list[click.Option]:
37
- options = [
38
- click.Option(
39
- ["--extensions"],
40
- default=None,
41
- type=DelimitedString(),
42
- help="Filters the files to be processed based on extension e.g. jpg, docx, etc.",
43
- ),
44
- click.Option(
45
- ["--recursive"],
46
- is_flag=True,
47
- default=False,
48
- help="Recursively download files in their respective folders "
49
- "otherwise stop at the files in provided folder level.",
50
- ),
51
- ]
52
- return options
53
-
54
-
55
- @dataclass
56
- class GoogleDriveCliDownloadConfig(CliConfig):
57
- @staticmethod
58
- def get_cli_options() -> list[click.Option]:
59
- options = [
60
- click.Option(
61
- ["--download-dir"],
62
- help="Where files are downloaded to, defaults to a location at"
63
- "`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
64
- ),
65
- ]
66
- return options
67
-
68
-
69
- google_drive_src_cmd = SrcCmd(
70
- cmd_name=CONNECTOR_TYPE,
71
- connection_config=GoogleDriveCliConnectionConfig,
72
- indexer_config=GoogleDriveCliIndexerConfig,
73
- downloader_config=GoogleDriveCliDownloadConfig,
74
- )
@@ -1,52 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.processes.connectors.local import CONNECTOR_TYPE
8
-
9
-
10
- @dataclass
11
- class LocalCliIndexerConfig(CliConfig):
12
- @staticmethod
13
- def get_cli_options() -> list[click.Option]:
14
- options = [
15
- click.Option(
16
- ["--input-path"],
17
- required=True,
18
- type=click.Path(file_okay=True, dir_okay=True, exists=True),
19
- help="Path to the location in the local file system that will be processed.",
20
- ),
21
- click.Option(
22
- ["--recursive"],
23
- is_flag=True,
24
- default=False,
25
- help="Recursively download files in their respective folders "
26
- "otherwise stop at the files in provided folder level.",
27
- ),
28
- ]
29
- return options
30
-
31
-
32
- @dataclass
33
- class LocalCliUploaderConfig(CliConfig):
34
- @staticmethod
35
- def get_cli_options() -> list[click.Option]:
36
- options = [
37
- click.Option(
38
- ["--output-dir"],
39
- required=True,
40
- type=str,
41
- help="Local path to write partitioned output to",
42
- )
43
- ]
44
- return options
45
-
46
-
47
- local_src_cmd = SrcCmd(
48
- cmd_name=CONNECTOR_TYPE,
49
- indexer_config=LocalCliIndexerConfig,
50
- )
51
-
52
- local_dest_cmd = DestCmd(cmd_name=CONNECTOR_TYPE, uploader_config=LocalCliUploaderConfig)
@@ -1,72 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.processes.connectors.milvus import CONNECTOR_TYPE
8
-
9
-
10
- @dataclass
11
- class MilvusCliConnectionConfig(CliConfig):
12
- @staticmethod
13
- def get_cli_options() -> list[click.Option]:
14
- options = [
15
- click.Option(
16
- ["--uri"],
17
- required=False,
18
- type=str,
19
- default=None,
20
- help="Milvus uri, eg 'http://localhost:19530",
21
- ),
22
- click.Option(
23
- ["--user"],
24
- required=False,
25
- type=str,
26
- default=None,
27
- help="Milvus user",
28
- ),
29
- click.Option(
30
- ["--password"],
31
- required=False,
32
- type=str,
33
- default=None,
34
- help="Milvus password",
35
- ),
36
- click.Option(
37
- ["--db-name"],
38
- required=False,
39
- type=str,
40
- default=None,
41
- help="Milvus database name",
42
- ),
43
- ]
44
- return options
45
-
46
-
47
- @dataclass
48
- class MilvusCliUploaderConfig(CliConfig):
49
- @staticmethod
50
- def get_cli_options() -> list[click.Option]:
51
- options = [
52
- click.Option(
53
- ["--collection-name"],
54
- required=True,
55
- type=str,
56
- help="Milvus collections to write to",
57
- ),
58
- click.Option(
59
- ["--num-of-processes"],
60
- type=click.IntRange(min=1),
61
- default=4,
62
- help="number of processes to use when writing to support parallel writes",
63
- ),
64
- ]
65
- return options
66
-
67
-
68
- milvus_dest_cmd = DestCmd(
69
- cmd_name=CONNECTOR_TYPE,
70
- connection_config=MilvusCliConnectionConfig,
71
- uploader_config=MilvusCliUploaderConfig,
72
- )
@@ -1,62 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.processes.connectors.mongodb import CONNECTOR_TYPE
8
-
9
-
10
- @dataclass
11
- class MongoDBCliConnectionConfig(CliConfig):
12
- @staticmethod
13
- def get_cli_options() -> list[click.Option]:
14
- options = [
15
- click.Option(
16
- ["--uri"],
17
- help="URI to user when connecting",
18
- ),
19
- click.Option(
20
- ["--host"],
21
- help="hostname or IP address or Unix domain socket path of a single mongod or "
22
- "mongos instance to connect to, or a list of hostnames",
23
- ),
24
- click.Option(["--port"], type=int, default=27017),
25
- click.Option(
26
- ["--database"], type=str, required=True, help="database name to connect to"
27
- ),
28
- click.Option(
29
- ["--collection"], required=True, type=str, help="collection name to connect to"
30
- ),
31
- ]
32
- return options
33
-
34
-
35
- @dataclass
36
- class MongoDBCliUploaderConfig(CliConfig):
37
- @staticmethod
38
- def get_cli_options() -> list[click.Option]:
39
- options = [
40
- click.Option(
41
- ["--batch-size"],
42
- default=100,
43
- type=int,
44
- help="Number of records per batch",
45
- )
46
- ]
47
- return options
48
-
49
-
50
- @dataclass
51
- class MongoDBCliUploadStagerConfig(CliConfig):
52
- @staticmethod
53
- def get_cli_options() -> list[click.Option]:
54
- return []
55
-
56
-
57
- mongodb_dest_cmd = DestCmd(
58
- cmd_name=CONNECTOR_TYPE,
59
- connection_config=MongoDBCliConnectionConfig,
60
- uploader_config=MongoDBCliUploaderConfig,
61
- upload_stager_config=MongoDBCliUploadStagerConfig,
62
- )
@@ -1,91 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import SrcCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.processes.connectors.onedrive import CONNECTOR_TYPE
8
-
9
-
10
- @dataclass
11
- class OnedriveCliConnectionConfig(CliConfig):
12
- @staticmethod
13
- def get_cli_options() -> list[click.Option]:
14
- options = [
15
- click.Option(
16
- ["--client-id"],
17
- required=True,
18
- type=str,
19
- help="Microsoft app client ID",
20
- ),
21
- click.Option(
22
- ["--client-cred"],
23
- required=True,
24
- type=str,
25
- help="Microsoft App client secret",
26
- ),
27
- click.Option(
28
- ["--user-pname"],
29
- required=True,
30
- type=str,
31
- help="User principal name, usually is your Azure AD email.",
32
- ),
33
- click.Option(
34
- ["--tenant"],
35
- default="common",
36
- type=str,
37
- help="ID or domain name associated with your Azure AD instance",
38
- ),
39
- click.Option(
40
- ["--authority-url"],
41
- default="https://login.microsoftonline.com",
42
- type=str,
43
- help="Authentication token provider for Microsoft apps, default is "
44
- "https://login.microsoftonline.com",
45
- ),
46
- ]
47
- return options
48
-
49
-
50
- @dataclass
51
- class OnedriveCliIndexerConfig(CliConfig):
52
- @staticmethod
53
- def get_cli_options() -> list[click.Option]:
54
- options = [
55
- click.Option(
56
- ["--path"],
57
- default=None,
58
- type=str,
59
- help="Folder to start parsing files from.",
60
- ),
61
- click.Option(
62
- ["--recursive"],
63
- is_flag=True,
64
- default=False,
65
- help="Recursively download files in their respective folders "
66
- "otherwise stop at the files in provided folder level.",
67
- ),
68
- ]
69
- return options
70
-
71
-
72
- @dataclass
73
- class OnedriveCliDownloadConfig(CliConfig):
74
- @staticmethod
75
- def get_cli_options() -> list[click.Option]:
76
- options = [
77
- click.Option(
78
- ["--download-dir"],
79
- help="Where files are downloaded to, defaults to a location at"
80
- "`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
81
- ),
82
- ]
83
- return options
84
-
85
-
86
- onedrive_drive_src_cmd = SrcCmd(
87
- cmd_name=CONNECTOR_TYPE,
88
- connection_config=OnedriveCliConnectionConfig,
89
- indexer_config=OnedriveCliIndexerConfig,
90
- downloader_config=OnedriveCliDownloadConfig,
91
- )
@@ -1,93 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
6
- from unstructured_ingest.v2.cli.cmds.elasticsearch import (
7
- ElasticsearchCliDownloadConfig,
8
- ElasticsearchCliIndexerConfig,
9
- ElasticsearchCliUploadStagerConfig,
10
- ElasticsearchUploaderConfig,
11
- )
12
- from unstructured_ingest.v2.cli.interfaces import CliConfig
13
- from unstructured_ingest.v2.cli.utils import DelimitedString
14
- from unstructured_ingest.v2.processes.connectors.opensearch import CONNECTOR_TYPE
15
-
16
-
17
- @dataclass
18
- class OpenSearchCliConnectionConfig(CliConfig):
19
- @staticmethod
20
- def get_cli_options() -> list[click.Option]:
21
- options = [
22
- click.Option(
23
- ["--hosts"],
24
- type=DelimitedString(),
25
- help='List of the OpenSearch hosts to connect to, e.g. "http://localhost:9200"',
26
- ),
27
- click.Option(
28
- ["--username"], type=str, default=None, help="username when using basic auth"
29
- ),
30
- click.Option(
31
- ["--password"],
32
- type=str,
33
- default=None,
34
- help="password when using basic auth",
35
- ),
36
- click.Option(
37
- ["--use-ssl"],
38
- type=bool,
39
- default=False,
40
- is_flag=True,
41
- help="use ssl for the connection",
42
- ),
43
- click.Option(
44
- ["--verify-certs"],
45
- type=bool,
46
- default=False,
47
- is_flag=True,
48
- help="whether to verify SSL certificates",
49
- ),
50
- click.Option(
51
- ["--ssl-show-warn"],
52
- type=bool,
53
- default=False,
54
- is_flag=True,
55
- help="show warning when verify certs is disabled",
56
- ),
57
- click.Option(
58
- ["--ca-certs"],
59
- type=click.Path(),
60
- default=None,
61
- help="path to CA bundle",
62
- ),
63
- click.Option(
64
- ["--client-cert"],
65
- type=click.Path(),
66
- default=None,
67
- help="path to the file containing the private key and the certificate,"
68
- " or cert only if using client_key",
69
- ),
70
- click.Option(
71
- ["--client-key"],
72
- type=click.Path(),
73
- default=None,
74
- help="path to the file containing the private key"
75
- " if using separate cert and key files",
76
- ),
77
- ]
78
- return options
79
-
80
-
81
- opensearch_src_cmd = SrcCmd(
82
- cmd_name=CONNECTOR_TYPE,
83
- connection_config=OpenSearchCliConnectionConfig,
84
- indexer_config=ElasticsearchCliIndexerConfig,
85
- downloader_config=ElasticsearchCliDownloadConfig,
86
- )
87
-
88
- opensearch_dest_cmd = DestCmd(
89
- cmd_name=CONNECTOR_TYPE,
90
- connection_config=OpenSearchCliConnectionConfig,
91
- upload_stager_config=ElasticsearchCliUploadStagerConfig,
92
- uploader_config=ElasticsearchUploaderConfig,
93
- )