datachain 0.7.11__tar.gz → 0.8.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.7.11 → datachain-0.8.1}/.github/workflows/benchmarks.yml +1 -1
- {datachain-0.7.11 → datachain-0.8.1}/.github/workflows/release.yml +1 -1
- {datachain-0.7.11 → datachain-0.8.1}/.github/workflows/tests-studio.yml +1 -1
- {datachain-0.7.11 → datachain-0.8.1}/.github/workflows/tests.yml +3 -3
- {datachain-0.7.11 → datachain-0.8.1}/.pre-commit-config.yaml +1 -1
- {datachain-0.7.11/src/datachain.egg-info → datachain-0.8.1}/PKG-INFO +4 -3
- {datachain-0.7.11 → datachain-0.8.1}/docs/quick-start.md +4 -2
- datachain-0.8.1/examples/get_started/json-csv-reader.py +82 -0
- {datachain-0.7.11 → datachain-0.8.1}/pyproject.toml +4 -3
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/catalog/catalog.py +56 -45
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/cli.py +25 -3
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/client/gcs.py +9 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/data_storage/sqlite.py +20 -6
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/data_storage/warehouse.py +0 -1
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/arrow.py +82 -58
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/dc.py +167 -166
- datachain-0.8.1/src/datachain/lib/diff.py +197 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/file.py +3 -1
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/listing.py +44 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/meta_formats.py +38 -42
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/udf.py +0 -1
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/batch.py +32 -6
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/dataset.py +18 -17
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/dispatch.py +125 -125
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/session.py +8 -5
- datachain-0.8.1/src/datachain/query/udf.py +20 -0
- datachain-0.8.1/src/datachain/query/utils.py +42 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/remote/studio.py +53 -1
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/studio.py +47 -2
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/utils.py +1 -1
- {datachain-0.7.11 → datachain-0.8.1/src/datachain.egg-info}/PKG-INFO +4 -3
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain.egg-info/SOURCES.txt +6 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain.egg-info/requires.txt +3 -2
- {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_catalog.py +6 -2
- {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_datachain.py +1 -1
- {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_meta_formats.py +4 -4
- {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_pull.py +18 -12
- datachain-0.8.1/tests/func/test_session.py +25 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/test_cli_studio.py +52 -1
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_arrow.py +26 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_datachain.py +3 -3
- datachain-0.8.1/tests/unit/lib/test_diff.py +498 -0
- datachain-0.8.1/tests/unit/test_client_gcs.py +17 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_listing.py +29 -2
- datachain-0.7.11/examples/get_started/json-csv-reader.py +0 -101
- {datachain-0.7.11 → datachain-0.8.1}/.cruft.json +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/.gitattributes +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/.github/codecov.yaml +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/.github/dependabot.yml +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/.gitignore +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/LICENSE +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/README.rst +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/docs/assets/datachain.svg +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/docs/contributing.md +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/docs/examples.md +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/docs/index.md +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/docs/overrides/main.html +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/docs/references/datachain.md +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/docs/references/datatype.md +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/docs/references/file.md +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/docs/references/index.md +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/docs/references/sql.md +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/docs/references/torch.md +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/docs/references/udf.md +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/docs/tutorials.md +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/multimodal/wds.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/mkdocs.yml +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/noxfile.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/setup.cfg +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/__main__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/asyn.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/cache.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/cli_utils.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/client/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/client/azure.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/client/hf.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/client/local.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/client/s3.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/config.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/dataset.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/error.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/array.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/base.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/conditional.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/func.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/numeric.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/path.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/random.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/string.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/func/window.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/job.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/clip.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/hf.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/image.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/settings.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/tar.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/text.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/utils.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/listing.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/model/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/model/bbox.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/model/pose.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/model/segment.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/node.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/progress.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/py.typed +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/metrics.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/params.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/queue.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/query/schema.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/types.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/sql/utils.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/telemetry.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/conftest.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/data.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/examples/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/examples/test_examples.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/examples/wds_data.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/func/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_client.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_datasets.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_listing.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_ls.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_metrics.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_pytorch.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_query.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/func/test_toolkit.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/scripts/feature_class.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/test_atomicity.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/test_cli_e2e.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/test_query_e2e.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/test_telemetry.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_asyn.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_cache.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_catalog.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_client.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_config.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_dataset.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_func.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_metastore.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_query.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_query_params.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_serializer.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_session.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_utils.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.7.11 → datachain-0.8.1}/tests/utils.py +0 -0
|
@@ -37,7 +37,7 @@ jobs:
|
|
|
37
37
|
python-version: '3.9'
|
|
38
38
|
|
|
39
39
|
- name: Setup uv
|
|
40
|
-
uses: astral-sh/setup-uv@
|
|
40
|
+
uses: astral-sh/setup-uv@v5
|
|
41
41
|
with:
|
|
42
42
|
enable-cache: true
|
|
43
43
|
cache-suffix: lint
|
|
@@ -94,7 +94,7 @@ jobs:
|
|
|
94
94
|
python-version: ${{ matrix.pyv }}
|
|
95
95
|
|
|
96
96
|
- name: Setup uv
|
|
97
|
-
uses: astral-sh/setup-uv@
|
|
97
|
+
uses: astral-sh/setup-uv@v5
|
|
98
98
|
with:
|
|
99
99
|
enable-cache: true
|
|
100
100
|
cache-suffix: tests-${{ matrix.pyv }}
|
|
@@ -157,7 +157,7 @@ jobs:
|
|
|
157
157
|
python-version: ${{ matrix.pyv }}
|
|
158
158
|
|
|
159
159
|
- name: Setup uv
|
|
160
|
-
uses: astral-sh/setup-uv@
|
|
160
|
+
uses: astral-sh/setup-uv@v5
|
|
161
161
|
with:
|
|
162
162
|
enable-cache: true
|
|
163
163
|
cache-suffix: examples-${{ matrix.pyv }}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.7.11
|
|
3
|
+
Version: 0.8.1
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -46,6 +46,7 @@ Requires-Dist: iterative-telemetry>=0.0.9
|
|
|
46
46
|
Requires-Dist: platformdirs
|
|
47
47
|
Requires-Dist: dvc-studio-client<1,>=0.21
|
|
48
48
|
Requires-Dist: tabulate
|
|
49
|
+
Requires-Dist: websockets
|
|
49
50
|
Provides-Extra: docs
|
|
50
51
|
Requires-Dist: mkdocs>=1.5.2; extra == "docs"
|
|
51
52
|
Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
|
|
@@ -83,7 +84,7 @@ Requires-Dist: requests-mock; extra == "tests"
|
|
|
83
84
|
Requires-Dist: scipy; extra == "tests"
|
|
84
85
|
Provides-Extra: dev
|
|
85
86
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
86
|
-
Requires-Dist: mypy==1.
|
|
87
|
+
Requires-Dist: mypy==1.14.0; extra == "dev"
|
|
87
88
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
88
89
|
Requires-Dist: types-pytz; extra == "dev"
|
|
89
90
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
@@ -98,7 +99,7 @@ Requires-Dist: unstructured[pdf]; extra == "examples"
|
|
|
98
99
|
Requires-Dist: pdfplumber==0.11.4; extra == "examples"
|
|
99
100
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
100
101
|
Requires-Dist: onnx==1.16.1; extra == "examples"
|
|
101
|
-
Requires-Dist: ultralytics==8.3.
|
|
102
|
+
Requires-Dist: ultralytics==8.3.53; extra == "examples"
|
|
102
103
|
|
|
103
104
|
================
|
|
104
105
|
|logo| DataChain
|
|
@@ -59,6 +59,8 @@ Batch inference with a simple sentiment model using the
|
|
|
59
59
|
pip install transformers
|
|
60
60
|
```
|
|
61
61
|
|
|
62
|
+
Note, `transformers` works only if `torch`, `tensorflow` >= 2.0, or `flax` are installed.
|
|
63
|
+
|
|
62
64
|
The code below downloads files from the cloud, and applies a
|
|
63
65
|
user-defined function to each one of them. All files with a positive
|
|
64
66
|
sentiment detected are then copied to the local directory.
|
|
@@ -114,13 +116,14 @@ DataChain can parallelize API calls; the free Mistral tier supports up
|
|
|
114
116
|
to 4 requests at the same time.
|
|
115
117
|
|
|
116
118
|
``` py
|
|
119
|
+
import os
|
|
117
120
|
from mistralai import Mistral
|
|
118
121
|
from datachain import File, DataChain, Column
|
|
119
122
|
|
|
120
123
|
PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
|
|
121
124
|
|
|
122
125
|
def eval_dialogue(file: File) -> bool:
|
|
123
|
-
client = Mistral()
|
|
126
|
+
client = Mistral(api_key = os.environ["MISTRAL_API_KEY"])
|
|
124
127
|
response = client.chat.complete(
|
|
125
128
|
model="open-mixtral-8x22b",
|
|
126
129
|
messages=[{"role": "system", "content": PROMPT},
|
|
@@ -130,7 +133,6 @@ def eval_dialogue(file: File) -> bool:
|
|
|
130
133
|
|
|
131
134
|
chain = (
|
|
132
135
|
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
|
|
133
|
-
.settings(parallel=4, cache=True)
|
|
134
136
|
.map(is_success=eval_dialogue)
|
|
135
137
|
.save("mistral_files")
|
|
136
138
|
)
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
|
|
5
|
+
from datachain import C, DataChain
|
|
6
|
+
from datachain.lib.data_model import ModelStore
|
|
7
|
+
from datachain.lib.meta_formats import gen_datamodel_code
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Sample model for static JSON model
|
|
11
|
+
class LicenseModel(BaseModel):
|
|
12
|
+
url: str
|
|
13
|
+
id: int
|
|
14
|
+
name: str
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
LicenseFeature = ModelStore.register(LicenseModel)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Sample model for static CSV model
|
|
21
|
+
class ChatDialog(BaseModel):
|
|
22
|
+
id: Optional[int] = None
|
|
23
|
+
count: Optional[int] = None
|
|
24
|
+
sender: Optional[str] = None
|
|
25
|
+
text: Optional[str] = None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
ChatFeature = ModelStore.register(ChatDialog)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def main():
|
|
32
|
+
# Dynamic JSONl schema from 2 objects
|
|
33
|
+
uri = "gs://datachain-demo/jsonl/object.jsonl"
|
|
34
|
+
jsonl_ds = DataChain.from_json(uri, format="jsonl", anon="True")
|
|
35
|
+
jsonl_ds.show()
|
|
36
|
+
|
|
37
|
+
# Dynamic JSON schema from 200 OpenImage json-pairs with validation errors
|
|
38
|
+
uri = "gs://datachain-demo/openimages-v6-test-jsonpairs/*json"
|
|
39
|
+
schema_uri = (
|
|
40
|
+
"gs://datachain-demo/openimages-v6-test-jsonpairs/08392c290ecc9d2a.json"
|
|
41
|
+
)
|
|
42
|
+
json_pairs_ds = DataChain.from_json(
|
|
43
|
+
uri, schema_from=schema_uri, jmespath="@", model_name="OpenImage", anon="True"
|
|
44
|
+
)
|
|
45
|
+
json_pairs_ds.show()
|
|
46
|
+
|
|
47
|
+
uri = "gs://datachain-demo/coco2017/annotations_captions/"
|
|
48
|
+
|
|
49
|
+
# Print JSON schema in Pydantic format from main COCO annotation
|
|
50
|
+
chain = DataChain.from_storage(uri, anon="True").filter(
|
|
51
|
+
C("file.path").glob("*.json")
|
|
52
|
+
)
|
|
53
|
+
file = next(chain.limit(1).collect("file"))
|
|
54
|
+
print(gen_datamodel_code(file, jmespath="@", model_name="Coco"))
|
|
55
|
+
|
|
56
|
+
# Static JSON schema test parsing 3/7 objects
|
|
57
|
+
static_json_ds = DataChain.from_json(
|
|
58
|
+
uri, jmespath="licenses", spec=LicenseFeature, nrows=3, anon="True"
|
|
59
|
+
)
|
|
60
|
+
static_json_ds.show()
|
|
61
|
+
|
|
62
|
+
# Dynamic JSON schema test parsing 5K objects
|
|
63
|
+
dynamic_json_ds = DataChain.from_json(uri, jmespath="images", anon="True")
|
|
64
|
+
print(dynamic_json_ds.to_pandas())
|
|
65
|
+
|
|
66
|
+
# Static CSV with header schema test parsing 3.5K objects
|
|
67
|
+
uri = "gs://datachain-demo/chatbot-csv/"
|
|
68
|
+
static_csv_ds = DataChain.from_csv(
|
|
69
|
+
uri, output=ChatDialog, object_name="chat", anon="True"
|
|
70
|
+
)
|
|
71
|
+
static_csv_ds.print_schema()
|
|
72
|
+
static_csv_ds.show()
|
|
73
|
+
|
|
74
|
+
# Dynamic CSV with header schema test parsing 3/3M objects
|
|
75
|
+
uri = "gs://datachain-demo/laion-aesthetics-csv/laion_aesthetics_1024_33M_1.csv"
|
|
76
|
+
dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", nrows=3, anon="True")
|
|
77
|
+
dynamic_csv_ds.print_schema()
|
|
78
|
+
dynamic_csv_ds.show()
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
if __name__ == "__main__":
|
|
82
|
+
main()
|
|
@@ -48,7 +48,8 @@ dependencies = [
|
|
|
48
48
|
"iterative-telemetry>=0.0.9",
|
|
49
49
|
"platformdirs",
|
|
50
50
|
"dvc-studio-client>=0.21,<1",
|
|
51
|
-
"tabulate"
|
|
51
|
+
"tabulate",
|
|
52
|
+
"websockets"
|
|
52
53
|
]
|
|
53
54
|
|
|
54
55
|
[project.optional-dependencies]
|
|
@@ -95,7 +96,7 @@ tests = [
|
|
|
95
96
|
]
|
|
96
97
|
dev = [
|
|
97
98
|
"datachain[docs,tests]",
|
|
98
|
-
"mypy==1.
|
|
99
|
+
"mypy==1.14.0",
|
|
99
100
|
"types-python-dateutil",
|
|
100
101
|
"types-pytz",
|
|
101
102
|
"types-PyYAML",
|
|
@@ -111,7 +112,7 @@ examples = [
|
|
|
111
112
|
"pdfplumber==0.11.4",
|
|
112
113
|
"huggingface_hub[hf_transfer]",
|
|
113
114
|
"onnx==1.16.1",
|
|
114
|
-
"ultralytics==8.3.
|
|
115
|
+
"ultralytics==8.3.53"
|
|
115
116
|
]
|
|
116
117
|
|
|
117
118
|
[project.urls]
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import io
|
|
2
2
|
import json
|
|
3
3
|
import logging
|
|
4
|
-
import math
|
|
5
4
|
import os
|
|
6
5
|
import os.path
|
|
7
6
|
import posixpath
|
|
@@ -13,7 +12,6 @@ from collections.abc import Iterable, Iterator, Mapping, Sequence
|
|
|
13
12
|
from copy import copy
|
|
14
13
|
from dataclasses import dataclass
|
|
15
14
|
from functools import cached_property, reduce
|
|
16
|
-
from random import shuffle
|
|
17
15
|
from threading import Thread
|
|
18
16
|
from typing import (
|
|
19
17
|
IO,
|
|
@@ -54,15 +52,12 @@ from datachain.error import (
|
|
|
54
52
|
QueryScriptCancelError,
|
|
55
53
|
QueryScriptRunError,
|
|
56
54
|
)
|
|
55
|
+
from datachain.lib.listing import get_listing
|
|
57
56
|
from datachain.node import DirType, Node, NodeWithPath
|
|
58
57
|
from datachain.nodes_thread_pool import NodesThreadPool
|
|
59
58
|
from datachain.remote.studio import StudioClient
|
|
60
59
|
from datachain.sql.types import DateTime, SQLType
|
|
61
|
-
from datachain.utils import (
|
|
62
|
-
DataChainDir,
|
|
63
|
-
batched,
|
|
64
|
-
datachain_paths_join,
|
|
65
|
-
)
|
|
60
|
+
from datachain.utils import DataChainDir, datachain_paths_join
|
|
66
61
|
|
|
67
62
|
from .datasource import DataSource
|
|
68
63
|
|
|
@@ -90,7 +85,7 @@ QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE = 10
|
|
|
90
85
|
QUERY_SCRIPT_CANCELED_EXIT_CODE = 11
|
|
91
86
|
|
|
92
87
|
# dataset pull
|
|
93
|
-
PULL_DATASET_MAX_THREADS =
|
|
88
|
+
PULL_DATASET_MAX_THREADS = 5
|
|
94
89
|
PULL_DATASET_CHUNK_TIMEOUT = 3600
|
|
95
90
|
PULL_DATASET_SLEEP_INTERVAL = 0.1 # sleep time while waiting for chunk to be available
|
|
96
91
|
PULL_DATASET_CHECK_STATUS_INTERVAL = 20 # interval to check export status in Studio
|
|
@@ -130,6 +125,7 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
130
125
|
local_ds_version: int,
|
|
131
126
|
schema: dict[str, Union[SQLType, type[SQLType]]],
|
|
132
127
|
max_threads: int = PULL_DATASET_MAX_THREADS,
|
|
128
|
+
progress_bar=None,
|
|
133
129
|
):
|
|
134
130
|
super().__init__(max_threads)
|
|
135
131
|
self._check_dependencies()
|
|
@@ -142,6 +138,7 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
142
138
|
self.schema = schema
|
|
143
139
|
self.last_status_check: Optional[float] = None
|
|
144
140
|
self.studio_client = StudioClient()
|
|
141
|
+
self.progress_bar = progress_bar
|
|
145
142
|
|
|
146
143
|
def done_task(self, done):
|
|
147
144
|
for task in done:
|
|
@@ -198,6 +195,20 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
198
195
|
for c in [c for c, t in self.schema.items() if t == DateTime]:
|
|
199
196
|
df[c] = pd.to_datetime(df[c], unit="s")
|
|
200
197
|
|
|
198
|
+
# id will be autogenerated in DB
|
|
199
|
+
return df.drop("sys__id", axis=1)
|
|
200
|
+
|
|
201
|
+
def get_parquet_content(self, url: str):
|
|
202
|
+
while True:
|
|
203
|
+
if self.should_check_for_status():
|
|
204
|
+
self.check_for_status()
|
|
205
|
+
r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
|
|
206
|
+
if r.status_code == 404:
|
|
207
|
+
time.sleep(PULL_DATASET_SLEEP_INTERVAL)
|
|
208
|
+
continue
|
|
209
|
+
r.raise_for_status()
|
|
210
|
+
return r.content
|
|
211
|
+
|
|
201
212
|
def do_task(self, urls):
|
|
202
213
|
import lz4.frame
|
|
203
214
|
import pandas as pd
|
|
@@ -207,31 +218,22 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
207
218
|
local_ds = metastore.get_dataset(self.local_ds_name)
|
|
208
219
|
|
|
209
220
|
urls = list(urls)
|
|
210
|
-
while urls:
|
|
211
|
-
for url in urls:
|
|
212
|
-
if self.should_check_for_status():
|
|
213
|
-
self.check_for_status()
|
|
214
221
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
# moving to the next url
|
|
219
|
-
continue
|
|
222
|
+
for url in urls:
|
|
223
|
+
if self.should_check_for_status():
|
|
224
|
+
self.check_for_status()
|
|
220
225
|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
self.fix_columns(df)
|
|
226
|
-
|
|
227
|
-
# id will be autogenerated in DB
|
|
228
|
-
df = df.drop("sys__id", axis=1)
|
|
226
|
+
df = pd.read_parquet(
|
|
227
|
+
io.BytesIO(lz4.frame.decompress(self.get_parquet_content(url)))
|
|
228
|
+
)
|
|
229
|
+
df = self.fix_columns(df)
|
|
229
230
|
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
231
|
+
inserted = warehouse.insert_dataset_rows(
|
|
232
|
+
df, local_ds, self.local_ds_version
|
|
233
|
+
)
|
|
234
|
+
self.increase_counter(inserted) # type: ignore [arg-type]
|
|
235
|
+
# sometimes progress bar doesn't get updated so manually updating it
|
|
236
|
+
self.update_progress_bar(self.progress_bar)
|
|
235
237
|
|
|
236
238
|
|
|
237
239
|
@dataclass
|
|
@@ -598,7 +600,7 @@ class Catalog:
|
|
|
598
600
|
source, session=self.session, update=update, object_name=object_name
|
|
599
601
|
)
|
|
600
602
|
|
|
601
|
-
list_ds_name, list_uri, list_path, _ =
|
|
603
|
+
list_ds_name, list_uri, list_path, _ = get_listing(
|
|
602
604
|
source, self.session, update=update
|
|
603
605
|
)
|
|
604
606
|
|
|
@@ -696,11 +698,9 @@ class Catalog:
|
|
|
696
698
|
)
|
|
697
699
|
indexed_sources = []
|
|
698
700
|
for source in dataset_sources:
|
|
699
|
-
from datachain.lib.dc import DataChain
|
|
700
|
-
|
|
701
701
|
client = self.get_client(source, **client_config)
|
|
702
702
|
uri = client.uri
|
|
703
|
-
dataset_name, _, _, _ =
|
|
703
|
+
dataset_name, _, _, _ = get_listing(uri, self.session)
|
|
704
704
|
listing = Listing(
|
|
705
705
|
self.metastore.clone(),
|
|
706
706
|
self.warehouse.clone(),
|
|
@@ -1291,13 +1291,13 @@ class Catalog:
|
|
|
1291
1291
|
for source in data_sources: # type: ignore [union-attr]
|
|
1292
1292
|
yield source, source.ls(fields)
|
|
1293
1293
|
|
|
1294
|
-
def pull_dataset( # noqa: PLR0915
|
|
1294
|
+
def pull_dataset( # noqa: C901, PLR0915
|
|
1295
1295
|
self,
|
|
1296
1296
|
remote_ds_uri: str,
|
|
1297
1297
|
output: Optional[str] = None,
|
|
1298
1298
|
local_ds_name: Optional[str] = None,
|
|
1299
1299
|
local_ds_version: Optional[int] = None,
|
|
1300
|
-
|
|
1300
|
+
cp: bool = False,
|
|
1301
1301
|
force: bool = False,
|
|
1302
1302
|
edatachain: bool = False,
|
|
1303
1303
|
edatachain_file: Optional[str] = None,
|
|
@@ -1305,7 +1305,7 @@ class Catalog:
|
|
|
1305
1305
|
client_config=None,
|
|
1306
1306
|
) -> None:
|
|
1307
1307
|
def _instantiate(ds_uri: str) -> None:
|
|
1308
|
-
if
|
|
1308
|
+
if not cp:
|
|
1309
1309
|
return
|
|
1310
1310
|
assert output
|
|
1311
1311
|
self.cp(
|
|
@@ -1318,7 +1318,7 @@ class Catalog:
|
|
|
1318
1318
|
)
|
|
1319
1319
|
print(f"Dataset {ds_uri} instantiated locally to {output}")
|
|
1320
1320
|
|
|
1321
|
-
if
|
|
1321
|
+
if cp and not output:
|
|
1322
1322
|
raise ValueError("Please provide output directory for instantiation")
|
|
1323
1323
|
|
|
1324
1324
|
studio_client = StudioClient()
|
|
@@ -1417,12 +1417,26 @@ class Catalog:
|
|
|
1417
1417
|
signed_urls = export_response.data
|
|
1418
1418
|
|
|
1419
1419
|
if signed_urls:
|
|
1420
|
-
shuffle(signed_urls)
|
|
1421
|
-
|
|
1422
1420
|
with (
|
|
1423
1421
|
self.metastore.clone() as metastore,
|
|
1424
1422
|
self.warehouse.clone() as warehouse,
|
|
1425
1423
|
):
|
|
1424
|
+
|
|
1425
|
+
def batch(urls):
|
|
1426
|
+
"""
|
|
1427
|
+
Batching urls in a way that fetching is most efficient as
|
|
1428
|
+
urls with lower id will be created first. Because of that, we
|
|
1429
|
+
are making sure all threads are pulling most recent urls
|
|
1430
|
+
from beginning
|
|
1431
|
+
"""
|
|
1432
|
+
res = [[] for i in range(PULL_DATASET_MAX_THREADS)]
|
|
1433
|
+
current_worker = 0
|
|
1434
|
+
for url in signed_urls:
|
|
1435
|
+
res[current_worker].append(url)
|
|
1436
|
+
current_worker = (current_worker + 1) % PULL_DATASET_MAX_THREADS
|
|
1437
|
+
|
|
1438
|
+
return res
|
|
1439
|
+
|
|
1426
1440
|
rows_fetcher = DatasetRowsFetcher(
|
|
1427
1441
|
metastore,
|
|
1428
1442
|
warehouse,
|
|
@@ -1431,14 +1445,11 @@ class Catalog:
|
|
|
1431
1445
|
local_ds_name,
|
|
1432
1446
|
local_ds_version,
|
|
1433
1447
|
schema,
|
|
1448
|
+
progress_bar=dataset_save_progress_bar,
|
|
1434
1449
|
)
|
|
1435
1450
|
try:
|
|
1436
1451
|
rows_fetcher.run(
|
|
1437
|
-
|
|
1438
|
-
signed_urls,
|
|
1439
|
-
math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
|
|
1440
|
-
),
|
|
1441
|
-
dataset_save_progress_bar,
|
|
1452
|
+
iter(batch(signed_urls)), dataset_save_progress_bar
|
|
1442
1453
|
)
|
|
1443
1454
|
except:
|
|
1444
1455
|
self.remove_dataset(local_ds_name, local_ds_version)
|
|
@@ -294,6 +294,28 @@ def add_studio_parser(subparsers, parent_parser) -> None:
|
|
|
294
294
|
help="Python package requirement. Can be specified multiple times.",
|
|
295
295
|
)
|
|
296
296
|
|
|
297
|
+
studio_cancel_help = "Cancel a job in Studio"
|
|
298
|
+
studio_cancel_description = "This command cancels a job in Studio."
|
|
299
|
+
|
|
300
|
+
studio_cancel_parser = studio_subparser.add_parser(
|
|
301
|
+
"cancel",
|
|
302
|
+
parents=[parent_parser],
|
|
303
|
+
description=studio_cancel_description,
|
|
304
|
+
help=studio_cancel_help,
|
|
305
|
+
)
|
|
306
|
+
|
|
307
|
+
studio_cancel_parser.add_argument(
|
|
308
|
+
"job_id",
|
|
309
|
+
action="store",
|
|
310
|
+
help="The job ID to cancel.",
|
|
311
|
+
)
|
|
312
|
+
studio_cancel_parser.add_argument(
|
|
313
|
+
"--team",
|
|
314
|
+
action="store",
|
|
315
|
+
default=None,
|
|
316
|
+
help="The team to cancel a job for. By default, it will use team from config.",
|
|
317
|
+
)
|
|
318
|
+
|
|
297
319
|
|
|
298
320
|
def get_parser() -> ArgumentParser: # noqa: PLR0915
|
|
299
321
|
try:
|
|
@@ -457,10 +479,10 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
|
|
|
457
479
|
help="Copy directories recursively",
|
|
458
480
|
)
|
|
459
481
|
parse_pull.add_argument(
|
|
460
|
-
"--
|
|
482
|
+
"--cp",
|
|
461
483
|
default=False,
|
|
462
484
|
action="store_true",
|
|
463
|
-
help="
|
|
485
|
+
help="Copy actual files after pulling remote dataset into local DB",
|
|
464
486
|
)
|
|
465
487
|
parse_pull.add_argument(
|
|
466
488
|
"--edatachain",
|
|
@@ -1300,7 +1322,7 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
|
|
|
1300
1322
|
args.output,
|
|
1301
1323
|
local_ds_name=args.local_name,
|
|
1302
1324
|
local_ds_version=args.local_version,
|
|
1303
|
-
|
|
1325
|
+
cp=args.cp,
|
|
1304
1326
|
force=bool(args.force),
|
|
1305
1327
|
edatachain=args.edatachain,
|
|
1306
1328
|
edatachain_file=args.edatachain_file,
|
|
@@ -32,6 +32,15 @@ class GCSClient(Client):
|
|
|
32
32
|
|
|
33
33
|
return cast(GCSFileSystem, super().create_fs(**kwargs))
|
|
34
34
|
|
|
35
|
+
def url(self, path: str, expires: int = 3600, **kwargs) -> str:
|
|
36
|
+
try:
|
|
37
|
+
return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
|
|
38
|
+
except AttributeError as exc:
|
|
39
|
+
is_anon = self.fs.storage_options.get("token") == "anon"
|
|
40
|
+
if is_anon and "you need a private key to sign credentials" in str(exc):
|
|
41
|
+
return f"https://storage.googleapis.com/{self.name}/{path}"
|
|
42
|
+
raise
|
|
43
|
+
|
|
35
44
|
@staticmethod
|
|
36
45
|
def parse_timestamp(timestamp: str) -> datetime:
|
|
37
46
|
"""
|
|
@@ -209,10 +209,12 @@ class SQLiteDatabaseEngine(DatabaseEngine):
|
|
|
209
209
|
|
|
210
210
|
@retry_sqlite_locks
|
|
211
211
|
def executemany(
|
|
212
|
-
self, query, params, cursor: Optional[sqlite3.Cursor] = None
|
|
212
|
+
self, query, params, cursor: Optional[sqlite3.Cursor] = None, conn=None
|
|
213
213
|
) -> sqlite3.Cursor:
|
|
214
214
|
if cursor:
|
|
215
215
|
return cursor.executemany(self.compile(query).string, params)
|
|
216
|
+
if conn:
|
|
217
|
+
return conn.executemany(self.compile(query).string, params)
|
|
216
218
|
return self.db.executemany(self.compile(query).string, params)
|
|
217
219
|
|
|
218
220
|
@retry_sqlite_locks
|
|
@@ -222,7 +224,14 @@ class SQLiteDatabaseEngine(DatabaseEngine):
|
|
|
222
224
|
return self.db.execute(sql, parameters)
|
|
223
225
|
|
|
224
226
|
def insert_dataframe(self, table_name: str, df) -> int:
|
|
225
|
-
return df.to_sql(
|
|
227
|
+
return df.to_sql(
|
|
228
|
+
table_name,
|
|
229
|
+
self.db,
|
|
230
|
+
if_exists="append",
|
|
231
|
+
index=False,
|
|
232
|
+
method="multi",
|
|
233
|
+
chunksize=1000,
|
|
234
|
+
)
|
|
226
235
|
|
|
227
236
|
def cursor(self, factory=None):
|
|
228
237
|
if factory is None:
|
|
@@ -545,10 +554,15 @@ class SQLiteWarehouse(AbstractWarehouse):
|
|
|
545
554
|
rows = list(rows)
|
|
546
555
|
if not rows:
|
|
547
556
|
return
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
557
|
+
|
|
558
|
+
with self.db.transaction() as conn:
|
|
559
|
+
# a single transaction speeds up inserts significantly as there is no separate
|
|
560
|
+
# transaction created for each insert row
|
|
561
|
+
self.db.executemany(
|
|
562
|
+
table.insert().values({f: bindparam(f) for f in rows[0]}),
|
|
563
|
+
rows,
|
|
564
|
+
conn=conn,
|
|
565
|
+
)
|
|
552
566
|
|
|
553
567
|
def insert_dataset_rows(self, df, dataset: DatasetRecord, version: int) -> int:
|
|
554
568
|
dr = self.dataset_rows(dataset, version)
|