unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +66 -12
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -21
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +17 -8
- unstructured_ingest/v2/interfaces/file_data.py +13 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/process.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/interfaces.py +3 -5
- unstructured_ingest/v2/pipeline/pipeline.py +73 -7
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +90 -24
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
- unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
- unstructured_ingest/v2/processes/connectors/local.py +36 -28
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
- unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
- unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
- unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
- unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
- unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +52 -39
- unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -60
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -6
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
|
@@ -1,77 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
6
|
-
from unstructured_ingest.v2.cli.utils import DelimitedString
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
@dataclass
|
|
10
|
-
class FsspecCliDownloadConfig(CliConfig):
|
|
11
|
-
@staticmethod
|
|
12
|
-
def get_cli_options() -> list[click.Option]:
|
|
13
|
-
return [
|
|
14
|
-
click.Option(
|
|
15
|
-
["--download-dir"],
|
|
16
|
-
help="Where files are downloaded to, defaults to a location at"
|
|
17
|
-
"`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
|
|
18
|
-
),
|
|
19
|
-
]
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
@dataclass
|
|
23
|
-
class FsspecCliFileConfig(CliConfig):
|
|
24
|
-
@staticmethod
|
|
25
|
-
def get_cli_options() -> list[click.Option]:
|
|
26
|
-
return [
|
|
27
|
-
click.Option(
|
|
28
|
-
["--remote-url"],
|
|
29
|
-
required=True,
|
|
30
|
-
help="Remote fsspec URL formatted as `protocol://dir/path`",
|
|
31
|
-
)
|
|
32
|
-
]
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
@dataclass
|
|
36
|
-
class FsspecCliUploaderConfig(FsspecCliFileConfig):
|
|
37
|
-
@staticmethod
|
|
38
|
-
def get_cli_options() -> list[click.Option]:
|
|
39
|
-
options = super(FsspecCliUploaderConfig, FsspecCliUploaderConfig).get_cli_options()
|
|
40
|
-
options.extend(
|
|
41
|
-
[
|
|
42
|
-
click.Option(
|
|
43
|
-
["--overwrite"],
|
|
44
|
-
is_flag=True,
|
|
45
|
-
default=False,
|
|
46
|
-
show_default=True,
|
|
47
|
-
help="If set, will overwrite content if content already exists",
|
|
48
|
-
)
|
|
49
|
-
]
|
|
50
|
-
)
|
|
51
|
-
return options
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
@dataclass
|
|
55
|
-
class FsspecCliIndexerConfig(FsspecCliFileConfig):
|
|
56
|
-
@staticmethod
|
|
57
|
-
def get_cli_options() -> list[click.Option]:
|
|
58
|
-
options = super(FsspecCliIndexerConfig, FsspecCliIndexerConfig).get_cli_options()
|
|
59
|
-
options.extend(
|
|
60
|
-
[
|
|
61
|
-
click.Option(
|
|
62
|
-
["--recursive"],
|
|
63
|
-
is_flag=True,
|
|
64
|
-
default=False,
|
|
65
|
-
help="Recursively download files in their respective folders "
|
|
66
|
-
"otherwise stop at the files in provided folder level.",
|
|
67
|
-
),
|
|
68
|
-
click.Option(
|
|
69
|
-
["--file-glob"],
|
|
70
|
-
default=None,
|
|
71
|
-
type=DelimitedString(),
|
|
72
|
-
help="A comma-separated list of file globs to limit which types of "
|
|
73
|
-
"local files are accepted, e.g. '*.html,*.txt'",
|
|
74
|
-
),
|
|
75
|
-
]
|
|
76
|
-
)
|
|
77
|
-
return options
|
|
@@ -1,81 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.cmds.fsspec.fsspec import (
|
|
7
|
-
FsspecCliDownloadConfig,
|
|
8
|
-
FsspecCliIndexerConfig,
|
|
9
|
-
FsspecCliUploaderConfig,
|
|
10
|
-
)
|
|
11
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
12
|
-
from unstructured_ingest.v2.cli.utils import FileOrJson
|
|
13
|
-
from unstructured_ingest.v2.processes.connectors.fsspec.gcs import (
|
|
14
|
-
CONNECTOR_TYPE,
|
|
15
|
-
)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
@dataclass
|
|
19
|
-
class GcsCliDownloadConfig(FsspecCliDownloadConfig):
|
|
20
|
-
pass
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@dataclass
|
|
24
|
-
class GcsCliIndexerConfig(FsspecCliIndexerConfig):
|
|
25
|
-
pass
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
@dataclass
|
|
29
|
-
class GcsCliConnectionConfig(CliConfig):
|
|
30
|
-
@staticmethod
|
|
31
|
-
def get_cli_options() -> list[click.Option]:
|
|
32
|
-
help_string = """
|
|
33
|
-
Options:
|
|
34
|
-
- ``None``, GCSFS will attempt to guess your credentials in the
|
|
35
|
-
following order: gcloud CLI default, gcsfs cached token, google compute
|
|
36
|
-
metadata service, anonymous.
|
|
37
|
-
- ``'google_default'``, your default gcloud credentials will be used,
|
|
38
|
-
which are typically established by doing ``gcloud login`` in a terminal.
|
|
39
|
-
- ``'cache'``, credentials from previously successful gcsfs
|
|
40
|
-
authentication will be used (use this after "browser" auth succeeded)
|
|
41
|
-
- ``'anon'``, no authentication is performed, and you can only
|
|
42
|
-
access data which is accessible to allUsers (in this case, the project and
|
|
43
|
-
access level parameters are meaningless)
|
|
44
|
-
- ``'browser'``, you get an access code with which you can
|
|
45
|
-
authenticate via a specially provided URL
|
|
46
|
-
- if ``'cloud'``, we assume we are running within google compute
|
|
47
|
-
or google container engine, and query the internal metadata directly for
|
|
48
|
-
a token.
|
|
49
|
-
- you may supply a token generated by the
|
|
50
|
-
[gcloud](https://cloud.google.com/sdk/docs/)
|
|
51
|
-
utility; this is either a python dictionary or the name of a file
|
|
52
|
-
containing the JSON returned by logging in with the gcloud CLI tool.
|
|
53
|
-
"""
|
|
54
|
-
options = [
|
|
55
|
-
click.Option(
|
|
56
|
-
["--service-account-key"],
|
|
57
|
-
default=None,
|
|
58
|
-
type=FileOrJson(allow_raw_str=True),
|
|
59
|
-
help=help_string,
|
|
60
|
-
),
|
|
61
|
-
]
|
|
62
|
-
return options
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
@dataclass
|
|
66
|
-
class GcsUploaderConfig(FsspecCliUploaderConfig):
|
|
67
|
-
pass
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
gcs_src_cmd = SrcCmd(
|
|
71
|
-
cmd_name=CONNECTOR_TYPE,
|
|
72
|
-
indexer_config=GcsCliIndexerConfig,
|
|
73
|
-
connection_config=GcsCliConnectionConfig,
|
|
74
|
-
downloader_config=GcsCliDownloadConfig,
|
|
75
|
-
)
|
|
76
|
-
|
|
77
|
-
gcs_dest_cmd = DestCmd(
|
|
78
|
-
cmd_name=CONNECTOR_TYPE,
|
|
79
|
-
connection_config=GcsCliConnectionConfig,
|
|
80
|
-
uploader_config=GcsUploaderConfig,
|
|
81
|
-
)
|
|
@@ -1,84 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.cmds.fsspec.fsspec import (
|
|
7
|
-
FsspecCliDownloadConfig,
|
|
8
|
-
FsspecCliIndexerConfig,
|
|
9
|
-
FsspecCliUploaderConfig,
|
|
10
|
-
)
|
|
11
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
12
|
-
from unstructured_ingest.v2.processes.connectors.fsspec.s3 import (
|
|
13
|
-
CONNECTOR_TYPE,
|
|
14
|
-
)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@dataclass
|
|
18
|
-
class S3CliDownloadConfig(FsspecCliDownloadConfig):
|
|
19
|
-
pass
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
@dataclass
|
|
23
|
-
class S3CliIndexerConfig(FsspecCliIndexerConfig):
|
|
24
|
-
pass
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
@dataclass
|
|
28
|
-
class S3CliConnectionConfig(CliConfig):
|
|
29
|
-
@staticmethod
|
|
30
|
-
def get_cli_options() -> list[click.Option]:
|
|
31
|
-
options = [
|
|
32
|
-
click.Option(
|
|
33
|
-
["--anonymous"],
|
|
34
|
-
is_flag=True,
|
|
35
|
-
default=False,
|
|
36
|
-
help="Connect to s3 without local AWS credentials.",
|
|
37
|
-
),
|
|
38
|
-
click.Option(
|
|
39
|
-
["--endpoint-url"],
|
|
40
|
-
type=str,
|
|
41
|
-
default=None,
|
|
42
|
-
help="Use this endpoint_url, if specified. Needed for "
|
|
43
|
-
"connecting to non-AWS S3 buckets.",
|
|
44
|
-
),
|
|
45
|
-
click.Option(
|
|
46
|
-
["--key"],
|
|
47
|
-
type=str,
|
|
48
|
-
default=None,
|
|
49
|
-
help="If not anonymous, use this access key ID, if specified. Takes precedence "
|
|
50
|
-
"over `aws_access_key_id` in client_kwargs.",
|
|
51
|
-
),
|
|
52
|
-
click.Option(
|
|
53
|
-
["--secret"],
|
|
54
|
-
type=str,
|
|
55
|
-
default=None,
|
|
56
|
-
help="If not anonymous, use this secret access key, if specified.",
|
|
57
|
-
),
|
|
58
|
-
click.Option(
|
|
59
|
-
["--token"],
|
|
60
|
-
type=str,
|
|
61
|
-
default=None,
|
|
62
|
-
help="If not anonymous, use this security token, if specified.",
|
|
63
|
-
),
|
|
64
|
-
]
|
|
65
|
-
return options
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
@dataclass
|
|
69
|
-
class S3UploaderConfig(FsspecCliUploaderConfig):
|
|
70
|
-
pass
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
s3_src_cmd = SrcCmd(
|
|
74
|
-
cmd_name=CONNECTOR_TYPE,
|
|
75
|
-
indexer_config=S3CliIndexerConfig,
|
|
76
|
-
connection_config=S3CliConnectionConfig,
|
|
77
|
-
downloader_config=S3CliDownloadConfig,
|
|
78
|
-
)
|
|
79
|
-
|
|
80
|
-
s3_dest_cmd = DestCmd(
|
|
81
|
-
cmd_name=CONNECTOR_TYPE,
|
|
82
|
-
connection_config=S3CliConnectionConfig,
|
|
83
|
-
uploader_config=S3UploaderConfig,
|
|
84
|
-
)
|
|
@@ -1,80 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.cmds.fsspec.fsspec import (
|
|
7
|
-
FsspecCliDownloadConfig,
|
|
8
|
-
FsspecCliIndexerConfig,
|
|
9
|
-
FsspecCliUploaderConfig,
|
|
10
|
-
)
|
|
11
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
12
|
-
from unstructured_ingest.v2.processes.connectors.fsspec.sftp import (
|
|
13
|
-
CONNECTOR_TYPE,
|
|
14
|
-
)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@dataclass
|
|
18
|
-
class SftpCliDownloadConfig(FsspecCliDownloadConfig):
|
|
19
|
-
pass
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
@dataclass
|
|
23
|
-
class SftpCliIndexerConfig(FsspecCliIndexerConfig):
|
|
24
|
-
pass
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
@dataclass
|
|
28
|
-
class SftpCliConnectionConfig(CliConfig):
|
|
29
|
-
@staticmethod
|
|
30
|
-
def get_cli_options() -> list[click.Option]:
|
|
31
|
-
options = [
|
|
32
|
-
click.Option(
|
|
33
|
-
["--username"],
|
|
34
|
-
required=True,
|
|
35
|
-
type=str,
|
|
36
|
-
help="Username for sftp connection",
|
|
37
|
-
),
|
|
38
|
-
click.Option(
|
|
39
|
-
["--password"],
|
|
40
|
-
required=True,
|
|
41
|
-
type=str,
|
|
42
|
-
help="Password for sftp connection",
|
|
43
|
-
),
|
|
44
|
-
click.Option(
|
|
45
|
-
["--look-for-keys"],
|
|
46
|
-
required=False,
|
|
47
|
-
default=False,
|
|
48
|
-
is_flag=True,
|
|
49
|
-
type=bool,
|
|
50
|
-
help="Whether to search for private key files in ~/.ssh/",
|
|
51
|
-
),
|
|
52
|
-
click.Option(
|
|
53
|
-
["--allow-agent"],
|
|
54
|
-
required=False,
|
|
55
|
-
default=False,
|
|
56
|
-
is_flag=True,
|
|
57
|
-
type=bool,
|
|
58
|
-
help="Whether to connect to the SSH agent.",
|
|
59
|
-
),
|
|
60
|
-
]
|
|
61
|
-
return options
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
@dataclass
|
|
65
|
-
class SftpUploaderConfig(FsspecCliUploaderConfig):
|
|
66
|
-
pass
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
sftp_src_cmd = SrcCmd(
|
|
70
|
-
cmd_name=CONNECTOR_TYPE,
|
|
71
|
-
indexer_config=SftpCliIndexerConfig,
|
|
72
|
-
connection_config=SftpCliConnectionConfig,
|
|
73
|
-
downloader_config=SftpCliDownloadConfig,
|
|
74
|
-
)
|
|
75
|
-
|
|
76
|
-
sftp_dest_cmd = DestCmd(
|
|
77
|
-
cmd_name=CONNECTOR_TYPE,
|
|
78
|
-
connection_config=SftpCliConnectionConfig,
|
|
79
|
-
uploader_config=SftpUploaderConfig,
|
|
80
|
-
)
|
|
@@ -1,74 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.cli.utils import DelimitedString, FileOrJson
|
|
8
|
-
from unstructured_ingest.v2.processes.connectors.google_drive import CONNECTOR_TYPE
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@dataclass
|
|
12
|
-
class GoogleDriveCliConnectionConfig(CliConfig):
|
|
13
|
-
@staticmethod
|
|
14
|
-
def get_cli_options() -> list[click.Option]:
|
|
15
|
-
options = [
|
|
16
|
-
click.Option(
|
|
17
|
-
["--drive-id"],
|
|
18
|
-
required=True,
|
|
19
|
-
type=str,
|
|
20
|
-
help="Google Drive File or Folder ID.",
|
|
21
|
-
),
|
|
22
|
-
click.Option(
|
|
23
|
-
["--service-account-key"],
|
|
24
|
-
required=True,
|
|
25
|
-
type=FileOrJson(),
|
|
26
|
-
help="Either the file path of the credentials file to use or a json string of "
|
|
27
|
-
"those values to use for authentication",
|
|
28
|
-
),
|
|
29
|
-
]
|
|
30
|
-
return options
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
@dataclass
|
|
34
|
-
class GoogleDriveCliIndexerConfig(CliConfig):
|
|
35
|
-
@staticmethod
|
|
36
|
-
def get_cli_options() -> list[click.Option]:
|
|
37
|
-
options = [
|
|
38
|
-
click.Option(
|
|
39
|
-
["--extensions"],
|
|
40
|
-
default=None,
|
|
41
|
-
type=DelimitedString(),
|
|
42
|
-
help="Filters the files to be processed based on extension e.g. jpg, docx, etc.",
|
|
43
|
-
),
|
|
44
|
-
click.Option(
|
|
45
|
-
["--recursive"],
|
|
46
|
-
is_flag=True,
|
|
47
|
-
default=False,
|
|
48
|
-
help="Recursively download files in their respective folders "
|
|
49
|
-
"otherwise stop at the files in provided folder level.",
|
|
50
|
-
),
|
|
51
|
-
]
|
|
52
|
-
return options
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
@dataclass
|
|
56
|
-
class GoogleDriveCliDownloadConfig(CliConfig):
|
|
57
|
-
@staticmethod
|
|
58
|
-
def get_cli_options() -> list[click.Option]:
|
|
59
|
-
options = [
|
|
60
|
-
click.Option(
|
|
61
|
-
["--download-dir"],
|
|
62
|
-
help="Where files are downloaded to, defaults to a location at"
|
|
63
|
-
"`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
|
|
64
|
-
),
|
|
65
|
-
]
|
|
66
|
-
return options
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
google_drive_src_cmd = SrcCmd(
|
|
70
|
-
cmd_name=CONNECTOR_TYPE,
|
|
71
|
-
connection_config=GoogleDriveCliConnectionConfig,
|
|
72
|
-
indexer_config=GoogleDriveCliIndexerConfig,
|
|
73
|
-
downloader_config=GoogleDriveCliDownloadConfig,
|
|
74
|
-
)
|
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.cli.utils import DelimitedString
|
|
8
|
-
from unstructured_ingest.v2.processes.connectors.local import CONNECTOR_TYPE
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@dataclass
|
|
12
|
-
class LocalCliIndexerConfig(CliConfig):
|
|
13
|
-
@staticmethod
|
|
14
|
-
def get_cli_options() -> list[click.Option]:
|
|
15
|
-
options = [
|
|
16
|
-
click.Option(
|
|
17
|
-
["--input-path"],
|
|
18
|
-
required=True,
|
|
19
|
-
type=click.Path(file_okay=True, dir_okay=True, exists=True),
|
|
20
|
-
help="Path to the location in the local file system that will be processed.",
|
|
21
|
-
),
|
|
22
|
-
click.Option(
|
|
23
|
-
["--file-glob"],
|
|
24
|
-
default=None,
|
|
25
|
-
type=DelimitedString(),
|
|
26
|
-
help="A comma-separated list of file globs to limit which types of "
|
|
27
|
-
"local files are accepted, e.g. '*.html,*.txt'",
|
|
28
|
-
),
|
|
29
|
-
click.Option(
|
|
30
|
-
["--recursive"],
|
|
31
|
-
is_flag=True,
|
|
32
|
-
default=False,
|
|
33
|
-
help="Recursively download files in their respective folders "
|
|
34
|
-
"otherwise stop at the files in provided folder level.",
|
|
35
|
-
),
|
|
36
|
-
]
|
|
37
|
-
return options
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
@dataclass
|
|
41
|
-
class LocalCliUploaderConfig(CliConfig):
|
|
42
|
-
@staticmethod
|
|
43
|
-
def get_cli_options() -> list[click.Option]:
|
|
44
|
-
options = [
|
|
45
|
-
click.Option(
|
|
46
|
-
["--output-dir"],
|
|
47
|
-
required=True,
|
|
48
|
-
type=str,
|
|
49
|
-
help="Local path to write partitioned output to",
|
|
50
|
-
)
|
|
51
|
-
]
|
|
52
|
-
return options
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
local_src_cmd = SrcCmd(
|
|
56
|
-
cmd_name=CONNECTOR_TYPE,
|
|
57
|
-
indexer_config=LocalCliIndexerConfig,
|
|
58
|
-
)
|
|
59
|
-
|
|
60
|
-
local_dest_cmd = DestCmd(cmd_name=CONNECTOR_TYPE, uploader_config=LocalCliUploaderConfig)
|
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.processes.connectors.milvus import CONNECTOR_TYPE
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass
|
|
11
|
-
class MilvusCliConnectionConfig(CliConfig):
|
|
12
|
-
@staticmethod
|
|
13
|
-
def get_cli_options() -> list[click.Option]:
|
|
14
|
-
options = [
|
|
15
|
-
click.Option(
|
|
16
|
-
["--uri"],
|
|
17
|
-
required=False,
|
|
18
|
-
type=str,
|
|
19
|
-
default=None,
|
|
20
|
-
help="Milvus uri, eg 'http://localhost:19530",
|
|
21
|
-
),
|
|
22
|
-
click.Option(
|
|
23
|
-
["--user"],
|
|
24
|
-
required=False,
|
|
25
|
-
type=str,
|
|
26
|
-
default=None,
|
|
27
|
-
help="Milvus user",
|
|
28
|
-
),
|
|
29
|
-
click.Option(
|
|
30
|
-
["--password"],
|
|
31
|
-
required=False,
|
|
32
|
-
type=str,
|
|
33
|
-
default=None,
|
|
34
|
-
help="Milvus password",
|
|
35
|
-
),
|
|
36
|
-
click.Option(
|
|
37
|
-
["--db-name"],
|
|
38
|
-
required=False,
|
|
39
|
-
type=str,
|
|
40
|
-
default=None,
|
|
41
|
-
help="Milvus database name",
|
|
42
|
-
),
|
|
43
|
-
]
|
|
44
|
-
return options
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
@dataclass
|
|
48
|
-
class MilvusCliUploaderConfig(CliConfig):
|
|
49
|
-
@staticmethod
|
|
50
|
-
def get_cli_options() -> list[click.Option]:
|
|
51
|
-
options = [
|
|
52
|
-
click.Option(
|
|
53
|
-
["--collection-name"],
|
|
54
|
-
required=True,
|
|
55
|
-
type=str,
|
|
56
|
-
help="Milvus collections to write to",
|
|
57
|
-
),
|
|
58
|
-
click.Option(
|
|
59
|
-
["--num-of-processes"],
|
|
60
|
-
type=click.IntRange(min=1),
|
|
61
|
-
default=4,
|
|
62
|
-
help="number of processes to use when writing to support parallel writes",
|
|
63
|
-
),
|
|
64
|
-
]
|
|
65
|
-
return options
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
milvus_dest_cmd = DestCmd(
|
|
69
|
-
cmd_name=CONNECTOR_TYPE,
|
|
70
|
-
connection_config=MilvusCliConnectionConfig,
|
|
71
|
-
uploader_config=MilvusCliUploaderConfig,
|
|
72
|
-
)
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.processes.connectors.mongodb import CONNECTOR_TYPE
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass
|
|
11
|
-
class MongoDBCliConnectionConfig(CliConfig):
|
|
12
|
-
@staticmethod
|
|
13
|
-
def get_cli_options() -> list[click.Option]:
|
|
14
|
-
options = [
|
|
15
|
-
click.Option(
|
|
16
|
-
["--uri"],
|
|
17
|
-
help="URI to user when connecting",
|
|
18
|
-
),
|
|
19
|
-
click.Option(
|
|
20
|
-
["--host"],
|
|
21
|
-
help="hostname or IP address or Unix domain socket path of a single mongod or "
|
|
22
|
-
"mongos instance to connect to, or a list of hostnames",
|
|
23
|
-
),
|
|
24
|
-
click.Option(["--port"], type=int, default=27017),
|
|
25
|
-
click.Option(
|
|
26
|
-
["--database"], type=str, required=True, help="database name to connect to"
|
|
27
|
-
),
|
|
28
|
-
click.Option(
|
|
29
|
-
["--collection"], required=True, type=str, help="collection name to connect to"
|
|
30
|
-
),
|
|
31
|
-
]
|
|
32
|
-
return options
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
@dataclass
|
|
36
|
-
class MongoDBCliUploaderConfig(CliConfig):
|
|
37
|
-
@staticmethod
|
|
38
|
-
def get_cli_options() -> list[click.Option]:
|
|
39
|
-
options = [
|
|
40
|
-
click.Option(
|
|
41
|
-
["--batch-size"],
|
|
42
|
-
default=100,
|
|
43
|
-
type=int,
|
|
44
|
-
help="Number of records per batch",
|
|
45
|
-
)
|
|
46
|
-
]
|
|
47
|
-
return options
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
@dataclass
|
|
51
|
-
class MongoDBCliUploadStagerConfig(CliConfig):
|
|
52
|
-
@staticmethod
|
|
53
|
-
def get_cli_options() -> list[click.Option]:
|
|
54
|
-
return []
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
mongodb_dest_cmd = DestCmd(
|
|
58
|
-
cmd_name=CONNECTOR_TYPE,
|
|
59
|
-
connection_config=MongoDBCliConnectionConfig,
|
|
60
|
-
uploader_config=MongoDBCliUploaderConfig,
|
|
61
|
-
upload_stager_config=MongoDBCliUploadStagerConfig,
|
|
62
|
-
)
|
|
@@ -1,91 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.processes.connectors.onedrive import CONNECTOR_TYPE
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass
|
|
11
|
-
class OnedriveCliConnectionConfig(CliConfig):
|
|
12
|
-
@staticmethod
|
|
13
|
-
def get_cli_options() -> list[click.Option]:
|
|
14
|
-
options = [
|
|
15
|
-
click.Option(
|
|
16
|
-
["--client-id"],
|
|
17
|
-
required=True,
|
|
18
|
-
type=str,
|
|
19
|
-
help="Microsoft app client ID",
|
|
20
|
-
),
|
|
21
|
-
click.Option(
|
|
22
|
-
["--client-cred"],
|
|
23
|
-
required=True,
|
|
24
|
-
type=str,
|
|
25
|
-
help="Microsoft App client secret",
|
|
26
|
-
),
|
|
27
|
-
click.Option(
|
|
28
|
-
["--user-pname"],
|
|
29
|
-
required=True,
|
|
30
|
-
type=str,
|
|
31
|
-
help="User principal name, usually is your Azure AD email.",
|
|
32
|
-
),
|
|
33
|
-
click.Option(
|
|
34
|
-
["--tenant"],
|
|
35
|
-
default="common",
|
|
36
|
-
type=str,
|
|
37
|
-
help="ID or domain name associated with your Azure AD instance",
|
|
38
|
-
),
|
|
39
|
-
click.Option(
|
|
40
|
-
["--authority-url"],
|
|
41
|
-
default="https://login.microsoftonline.com",
|
|
42
|
-
type=str,
|
|
43
|
-
help="Authentication token provider for Microsoft apps, default is "
|
|
44
|
-
"https://login.microsoftonline.com",
|
|
45
|
-
),
|
|
46
|
-
]
|
|
47
|
-
return options
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
@dataclass
|
|
51
|
-
class OnedriveCliIndexerConfig(CliConfig):
|
|
52
|
-
@staticmethod
|
|
53
|
-
def get_cli_options() -> list[click.Option]:
|
|
54
|
-
options = [
|
|
55
|
-
click.Option(
|
|
56
|
-
["--path"],
|
|
57
|
-
default=None,
|
|
58
|
-
type=str,
|
|
59
|
-
help="Folder to start parsing files from.",
|
|
60
|
-
),
|
|
61
|
-
click.Option(
|
|
62
|
-
["--recursive"],
|
|
63
|
-
is_flag=True,
|
|
64
|
-
default=False,
|
|
65
|
-
help="Recursively download files in their respective folders "
|
|
66
|
-
"otherwise stop at the files in provided folder level.",
|
|
67
|
-
),
|
|
68
|
-
]
|
|
69
|
-
return options
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
@dataclass
|
|
73
|
-
class OnedriveCliDownloadConfig(CliConfig):
|
|
74
|
-
@staticmethod
|
|
75
|
-
def get_cli_options() -> list[click.Option]:
|
|
76
|
-
options = [
|
|
77
|
-
click.Option(
|
|
78
|
-
["--download-dir"],
|
|
79
|
-
help="Where files are downloaded to, defaults to a location at"
|
|
80
|
-
"`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
|
|
81
|
-
),
|
|
82
|
-
]
|
|
83
|
-
return options
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
onedrive_drive_src_cmd = SrcCmd(
|
|
87
|
-
cmd_name=CONNECTOR_TYPE,
|
|
88
|
-
connection_config=OnedriveCliConnectionConfig,
|
|
89
|
-
indexer_config=OnedriveCliIndexerConfig,
|
|
90
|
-
downloader_config=OnedriveCliDownloadConfig,
|
|
91
|
-
)
|