datachain 0.3.17__tar.gz → 0.3.18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.3.17/src/datachain.egg-info → datachain-0.3.18}/PKG-INFO +5 -3
- datachain-0.3.18/examples/llm_and_nlp/unstructured-embeddings-gen.py +76 -0
- datachain-0.3.17/examples/llm_and_nlp/unstructured-text.py → datachain-0.3.18/examples/llm_and_nlp/unstructured-summary-map.py +7 -3
- {datachain-0.3.17 → datachain-0.3.18}/examples/multimodal/hf_pipeline.py +7 -1
- {datachain-0.3.17 → datachain-0.3.18}/examples/multimodal/openai_image_desc_lib.py +0 -2
- {datachain-0.3.17 → datachain-0.3.18}/pyproject.toml +6 -4
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/cache.py +14 -55
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/catalog/catalog.py +8 -18
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/cli.py +7 -1
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/client/fsspec.py +29 -63
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/client/local.py +2 -3
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/arrow.py +2 -1
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/dc.py +4 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/file.py +41 -23
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/listing.py +2 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/listing.py +4 -4
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/node.py +6 -6
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/nodes_fetcher.py +12 -5
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/nodes_thread_pool.py +1 -1
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/progress.py +2 -12
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/dataset.py +6 -18
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/dispatch.py +2 -15
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/schema.py +25 -24
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/udf.py +0 -106
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/types.py +4 -2
- datachain-0.3.18/src/datachain/telemetry.py +37 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/utils.py +11 -0
- {datachain-0.3.17 → datachain-0.3.18/src/datachain.egg-info}/PKG-INFO +5 -3
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain.egg-info/SOURCES.txt +4 -1
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain.egg-info/requires.txt +4 -2
- {datachain-0.3.17 → datachain-0.3.18}/tests/conftest.py +5 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_datachain.py +3 -4
- {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_datasets.py +6 -8
- datachain-0.3.18/tests/test_telemetry.py +20 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_file.py +3 -26
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_cache.py +9 -4
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_data_storage.py +1 -1
- {datachain-0.3.17 → datachain-0.3.18}/.cruft.json +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/.gitattributes +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/.github/codecov.yaml +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/.github/dependabot.yml +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/.github/workflows/release.yml +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/.github/workflows/tests.yml +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/.gitignore +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/.pre-commit-config.yaml +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/CONTRIBUTING.rst +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/LICENSE +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/README.rst +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/docs/assets/datachain.svg +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/docs/assets/flowchart.png +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/docs/index.md +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/docs/references/datachain.md +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/docs/references/datatype.md +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/docs/references/file.md +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/docs/references/index.md +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/docs/references/sql.md +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/docs/references/torch.md +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/docs/references/udf.md +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/examples/multimodal/wds.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/mkdocs.yml +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/noxfile.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/setup.cfg +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/__main__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/asyn.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/cli_utils.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/client/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/client/azure.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/client/gcs.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/client/hf.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/client/s3.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/config.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/id_generator.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/dataset.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/error.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/job.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/clip.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/hf.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/image.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/settings.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/tar.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/text.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/udf.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/utils.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/py.typed +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/batch.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/metrics.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/params.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/queue.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/query/session.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/remote/studio.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/sql/utils.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/storage.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/data.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/examples/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/examples/test_examples.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/examples/wds_data.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/func/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_catalog.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_client.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_listing.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_ls.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_metrics.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_pull.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_pytorch.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/func/test_query.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/scripts/feature_class.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/test_cli_e2e.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/test_query_e2e.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_asyn.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_catalog.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_client.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_dataset.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_id_generator.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_listing.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_metastore.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_query_params.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_serializer.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_session.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_storage.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_utils.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.3.17 → datachain-0.3.18}/tests/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.3.17
|
|
3
|
+
Version: 0.3.18
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -44,6 +44,7 @@ Requires-Dist: Pillow<11,>=10.0.0
|
|
|
44
44
|
Requires-Dist: msgpack<2,>=1.0.4
|
|
45
45
|
Requires-Dist: psutil
|
|
46
46
|
Requires-Dist: huggingface_hub
|
|
47
|
+
Requires-Dist: iterative-telemetry>=0.0.9
|
|
47
48
|
Provides-Extra: docs
|
|
48
49
|
Requires-Dist: mkdocs>=1.5.2; extra == "docs"
|
|
49
50
|
Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
|
|
@@ -69,7 +70,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
|
|
|
69
70
|
Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
|
|
70
71
|
Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
|
|
71
72
|
Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
|
|
72
|
-
Requires-Dist: pytest-servers[all]>=0.5.
|
|
73
|
+
Requires-Dist: pytest-servers[all]>=0.5.7; extra == "tests"
|
|
73
74
|
Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
|
|
74
75
|
Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
|
|
75
76
|
Requires-Dist: virtualenv; extra == "tests"
|
|
@@ -91,9 +92,10 @@ Requires-Dist: datachain[tests]; extra == "examples"
|
|
|
91
92
|
Requires-Dist: numpy<2,>=1; extra == "examples"
|
|
92
93
|
Requires-Dist: defusedxml; extra == "examples"
|
|
93
94
|
Requires-Dist: accelerate; extra == "examples"
|
|
94
|
-
Requires-Dist: unstructured[pdf]; extra == "examples"
|
|
95
|
+
Requires-Dist: unstructured[embed-huggingface,pdf]; extra == "examples"
|
|
95
96
|
Requires-Dist: pdfplumber==0.11.4; extra == "examples"
|
|
96
97
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
98
|
+
Requires-Dist: onnx==1.16.1; extra == "examples"
|
|
97
99
|
|
|
98
100
|
================
|
|
99
101
|
|logo| DataChain
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""
|
|
2
|
+
To install the required dependencies:
|
|
3
|
+
|
|
4
|
+
pip install datachain[examples]
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from collections.abc import Iterator
|
|
9
|
+
|
|
10
|
+
from unstructured.cleaners.core import (
|
|
11
|
+
clean,
|
|
12
|
+
group_broken_paragraphs,
|
|
13
|
+
replace_unicode_quotes,
|
|
14
|
+
)
|
|
15
|
+
from unstructured.embed.huggingface import (
|
|
16
|
+
HuggingFaceEmbeddingConfig,
|
|
17
|
+
HuggingFaceEmbeddingEncoder,
|
|
18
|
+
)
|
|
19
|
+
from unstructured.partition.pdf import partition_pdf
|
|
20
|
+
|
|
21
|
+
from datachain import C, DataChain, DataModel, File
|
|
22
|
+
|
|
23
|
+
source = "gs://datachain-demo/neurips/1987/"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Define the output as a DataModel class
|
|
27
|
+
class Chunk(DataModel):
|
|
28
|
+
key: str
|
|
29
|
+
text: str
|
|
30
|
+
embeddings: list[float]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Define embedding encoder
|
|
34
|
+
|
|
35
|
+
embedding_encoder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Use signatures to define UDF input/output
|
|
39
|
+
# these can be pydantic model or regular Python types
|
|
40
|
+
def process_pdf(file: File) -> Iterator[Chunk]:
|
|
41
|
+
# Ingest the file
|
|
42
|
+
with file.open() as f:
|
|
43
|
+
chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast")
|
|
44
|
+
|
|
45
|
+
# Clean the chunks and add new columns
|
|
46
|
+
for chunk in chunks:
|
|
47
|
+
chunk.apply(
|
|
48
|
+
lambda text: clean(
|
|
49
|
+
text, bullets=True, extra_whitespace=True, trailing_punctuation=True
|
|
50
|
+
)
|
|
51
|
+
)
|
|
52
|
+
chunk.apply(replace_unicode_quotes)
|
|
53
|
+
chunk.apply(group_broken_paragraphs)
|
|
54
|
+
|
|
55
|
+
# create embeddings
|
|
56
|
+
chunks_embedded = embedding_encoder.embed_documents(chunks)
|
|
57
|
+
|
|
58
|
+
# Add new rows to DataChain
|
|
59
|
+
for chunk in chunks_embedded:
|
|
60
|
+
yield Chunk(
|
|
61
|
+
key=file.path,
|
|
62
|
+
text=chunk.text,
|
|
63
|
+
embeddings=chunk.embeddings,
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
dc = (
|
|
68
|
+
DataChain.from_storage(source)
|
|
69
|
+
.settings(parallel=-1)
|
|
70
|
+
.filter(C.file.path.glob("*.pdf"))
|
|
71
|
+
.gen(document=process_pdf)
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
dc.save("embedded-documents")
|
|
75
|
+
|
|
76
|
+
DataChain.from_dataset("embedded-documents").show()
|
|
@@ -46,7 +46,8 @@ dependencies = [
|
|
|
46
46
|
"Pillow>=10.0.0,<11",
|
|
47
47
|
"msgpack>=1.0.4,<2",
|
|
48
48
|
"psutil",
|
|
49
|
-
"huggingface_hub"
|
|
49
|
+
"huggingface_hub",
|
|
50
|
+
"iterative-telemetry>=0.0.9"
|
|
50
51
|
]
|
|
51
52
|
|
|
52
53
|
[project.optional-dependencies]
|
|
@@ -80,7 +81,7 @@ tests = [
|
|
|
80
81
|
"pytest-sugar>=0.9.6",
|
|
81
82
|
"pytest-cov>=4.1.0",
|
|
82
83
|
"pytest-mock>=3.12.0",
|
|
83
|
-
"pytest-servers[all]>=0.5.
|
|
84
|
+
"pytest-servers[all]>=0.5.7",
|
|
84
85
|
"pytest-benchmark[histogram]",
|
|
85
86
|
"pytest-xdist>=3.3.1",
|
|
86
87
|
"virtualenv",
|
|
@@ -104,9 +105,10 @@ examples = [
|
|
|
104
105
|
"numpy>=1,<2",
|
|
105
106
|
"defusedxml",
|
|
106
107
|
"accelerate",
|
|
107
|
-
"unstructured[pdf]",
|
|
108
|
+
"unstructured[pdf, embed-huggingface]",
|
|
108
109
|
"pdfplumber==0.11.4",
|
|
109
|
-
"huggingface_hub[hf_transfer]"
|
|
110
|
+
"huggingface_hub[hf_transfer]",
|
|
111
|
+
"onnx==1.16.1"
|
|
110
112
|
]
|
|
111
113
|
|
|
112
114
|
[project.urls]
|
|
@@ -1,56 +1,15 @@
|
|
|
1
|
-
import hashlib
|
|
2
|
-
import json
|
|
3
1
|
import os
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
from functools import partial
|
|
6
2
|
from typing import TYPE_CHECKING, Optional
|
|
7
3
|
|
|
8
|
-
import attrs
|
|
9
4
|
from dvc_data.hashfile.db.local import LocalHashFileDB
|
|
10
5
|
from dvc_objects.fs.local import LocalFileSystem
|
|
11
6
|
from fsspec.callbacks import Callback, TqdmCallback
|
|
12
7
|
|
|
13
|
-
from datachain.utils import TIME_ZERO
|
|
14
|
-
|
|
15
8
|
from .progress import Tqdm
|
|
16
9
|
|
|
17
10
|
if TYPE_CHECKING:
|
|
18
11
|
from datachain.client import Client
|
|
19
|
-
from datachain.storage import StorageURI
|
|
20
|
-
|
|
21
|
-
sha256 = partial(hashlib.sha256, usedforsecurity=False)
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
@attrs.frozen
|
|
25
|
-
class UniqueId:
|
|
26
|
-
storage: "StorageURI"
|
|
27
|
-
path: str
|
|
28
|
-
size: int
|
|
29
|
-
etag: str
|
|
30
|
-
version: str = ""
|
|
31
|
-
is_latest: bool = True
|
|
32
|
-
location: Optional[str] = None
|
|
33
|
-
last_modified: datetime = TIME_ZERO
|
|
34
|
-
|
|
35
|
-
def get_parsed_location(self) -> Optional[dict]:
|
|
36
|
-
if not self.location:
|
|
37
|
-
return None
|
|
38
|
-
|
|
39
|
-
loc_stack = (
|
|
40
|
-
json.loads(self.location)
|
|
41
|
-
if isinstance(self.location, str)
|
|
42
|
-
else self.location
|
|
43
|
-
)
|
|
44
|
-
if len(loc_stack) > 1:
|
|
45
|
-
raise NotImplementedError("Nested v-objects are not supported yet.")
|
|
46
|
-
|
|
47
|
-
return loc_stack[0]
|
|
48
|
-
|
|
49
|
-
def get_hash(self) -> str:
|
|
50
|
-
fingerprint = f"{self.storage}/{self.path}/{self.version}/{self.etag}"
|
|
51
|
-
if self.location:
|
|
52
|
-
fingerprint += f"/{self.location}"
|
|
53
|
-
return sha256(fingerprint.encode()).hexdigest()
|
|
12
|
+
from datachain.lib.file import File
|
|
54
13
|
|
|
55
14
|
|
|
56
15
|
def try_scandir(path):
|
|
@@ -77,30 +36,30 @@ class DataChainCache:
|
|
|
77
36
|
def tmp_dir(self):
|
|
78
37
|
return self.odb.tmp_dir
|
|
79
38
|
|
|
80
|
-
def get_path(self,
|
|
81
|
-
if self.contains(
|
|
82
|
-
return self.path_from_checksum(
|
|
39
|
+
def get_path(self, file: "File") -> Optional[str]:
|
|
40
|
+
if self.contains(file):
|
|
41
|
+
return self.path_from_checksum(file.get_hash())
|
|
83
42
|
return None
|
|
84
43
|
|
|
85
|
-
def contains(self,
|
|
86
|
-
return self.odb.exists(
|
|
44
|
+
def contains(self, file: "File") -> bool:
|
|
45
|
+
return self.odb.exists(file.get_hash())
|
|
87
46
|
|
|
88
47
|
def path_from_checksum(self, checksum: str) -> str:
|
|
89
48
|
assert checksum
|
|
90
49
|
return self.odb.oid_to_path(checksum)
|
|
91
50
|
|
|
92
|
-
def remove(self,
|
|
93
|
-
self.odb.delete(
|
|
51
|
+
def remove(self, file: "File") -> None:
|
|
52
|
+
self.odb.delete(file.get_hash())
|
|
94
53
|
|
|
95
54
|
async def download(
|
|
96
|
-
self,
|
|
55
|
+
self, file: "File", client: "Client", callback: Optional[Callback] = None
|
|
97
56
|
) -> None:
|
|
98
|
-
from_path = f"{
|
|
57
|
+
from_path = f"{file.source}/{file.path}"
|
|
99
58
|
from dvc_objects.fs.utils import tmp_fname
|
|
100
59
|
|
|
101
60
|
odb_fs = self.odb.fs
|
|
102
61
|
tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname()) # type: ignore[arg-type]
|
|
103
|
-
size =
|
|
62
|
+
size = file.size
|
|
104
63
|
if size < 0:
|
|
105
64
|
size = await client.get_size(from_path)
|
|
106
65
|
cb = callback or TqdmCallback(
|
|
@@ -115,13 +74,13 @@ class DataChainCache:
|
|
|
115
74
|
cb.close()
|
|
116
75
|
|
|
117
76
|
try:
|
|
118
|
-
oid =
|
|
77
|
+
oid = file.get_hash()
|
|
119
78
|
self.odb.add(tmp_info, self.odb.fs, oid)
|
|
120
79
|
finally:
|
|
121
80
|
os.unlink(tmp_info)
|
|
122
81
|
|
|
123
|
-
def store_data(self,
|
|
124
|
-
checksum =
|
|
82
|
+
def store_data(self, file: "File", contents: bytes) -> None:
|
|
83
|
+
checksum = file.get_hash()
|
|
125
84
|
dst = self.path_from_checksum(checksum)
|
|
126
85
|
if not os.path.exists(dst):
|
|
127
86
|
# Create the file only if it's not already in cache
|
|
@@ -34,7 +34,7 @@ import yaml
|
|
|
34
34
|
from sqlalchemy import Column
|
|
35
35
|
from tqdm import tqdm
|
|
36
36
|
|
|
37
|
-
from datachain.cache import DataChainCache, UniqueId
|
|
37
|
+
from datachain.cache import DataChainCache
|
|
38
38
|
from datachain.client import Client
|
|
39
39
|
from datachain.config import get_remote_config, read_config
|
|
40
40
|
from datachain.dataset import (
|
|
@@ -619,13 +619,13 @@ class Catalog:
|
|
|
619
619
|
code_ast.body[-1:] = new_expressions
|
|
620
620
|
return code_ast
|
|
621
621
|
|
|
622
|
-
def get_client(self, uri:
|
|
622
|
+
def get_client(self, uri: str, **config: Any) -> Client:
|
|
623
623
|
"""
|
|
624
624
|
Return the client corresponding to the given source `uri`.
|
|
625
625
|
"""
|
|
626
626
|
config = config or self.client_config
|
|
627
627
|
cls = Client.get_implementation(uri)
|
|
628
|
-
return cls.from_source(uri, self.cache, **config)
|
|
628
|
+
return cls.from_source(StorageURI(uri), self.cache, **config)
|
|
629
629
|
|
|
630
630
|
def enlist_source(
|
|
631
631
|
self,
|
|
@@ -1431,7 +1431,7 @@ class Catalog:
|
|
|
1431
1431
|
|
|
1432
1432
|
def get_file_signals(
|
|
1433
1433
|
self, dataset_name: str, dataset_version: int, row: RowDict
|
|
1434
|
-
) -> Optional[
|
|
1434
|
+
) -> Optional[RowDict]:
|
|
1435
1435
|
"""
|
|
1436
1436
|
Function that returns file signals from dataset row.
|
|
1437
1437
|
Note that signal names are without prefix, so if there was 'laion__file__source'
|
|
@@ -1448,7 +1448,7 @@ class Catalog:
|
|
|
1448
1448
|
|
|
1449
1449
|
version = self.get_dataset(dataset_name).get_version(dataset_version)
|
|
1450
1450
|
|
|
1451
|
-
file_signals_values =
|
|
1451
|
+
file_signals_values = RowDict()
|
|
1452
1452
|
|
|
1453
1453
|
schema = SignalSchema.deserialize(version.feature_schema)
|
|
1454
1454
|
for file_signals in schema.get_signals(File):
|
|
@@ -1476,6 +1476,8 @@ class Catalog:
|
|
|
1476
1476
|
use_cache: bool = True,
|
|
1477
1477
|
**config: Any,
|
|
1478
1478
|
):
|
|
1479
|
+
from datachain.lib.file import File
|
|
1480
|
+
|
|
1479
1481
|
file_signals = self.get_file_signals(dataset_name, dataset_version, row)
|
|
1480
1482
|
if not file_signals:
|
|
1481
1483
|
raise RuntimeError("Cannot open object without file signals")
|
|
@@ -1483,22 +1485,10 @@ class Catalog:
|
|
|
1483
1485
|
config = config or self.client_config
|
|
1484
1486
|
client = self.get_client(file_signals["source"], **config)
|
|
1485
1487
|
return client.open_object(
|
|
1486
|
-
|
|
1488
|
+
File._from_row(file_signals),
|
|
1487
1489
|
use_cache=use_cache,
|
|
1488
1490
|
)
|
|
1489
1491
|
|
|
1490
|
-
def _get_row_uid(self, row: RowDict) -> UniqueId:
|
|
1491
|
-
return UniqueId(
|
|
1492
|
-
row["source"],
|
|
1493
|
-
row["path"],
|
|
1494
|
-
row["size"],
|
|
1495
|
-
row["etag"],
|
|
1496
|
-
row["version"],
|
|
1497
|
-
row["is_latest"],
|
|
1498
|
-
row["location"],
|
|
1499
|
-
row["last_modified"],
|
|
1500
|
-
)
|
|
1501
|
-
|
|
1502
1492
|
def ls(
|
|
1503
1493
|
self,
|
|
1504
1494
|
sources: list[str],
|
|
@@ -15,6 +15,7 @@ import shtab
|
|
|
15
15
|
from datachain import utils
|
|
16
16
|
from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
|
|
17
17
|
from datachain.lib.dc import DataChain
|
|
18
|
+
from datachain.telemetry import telemetry
|
|
18
19
|
from datachain.utils import DataChainDir
|
|
19
20
|
|
|
20
21
|
if TYPE_CHECKING:
|
|
@@ -872,6 +873,7 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
|
|
|
872
873
|
# This also sets this environment variable for any subprocesses
|
|
873
874
|
os.environ["DEBUG_SHOW_SQL_QUERIES"] = "True"
|
|
874
875
|
|
|
876
|
+
error = None
|
|
875
877
|
try:
|
|
876
878
|
catalog = get_catalog(client_config=client_config)
|
|
877
879
|
if args.command == "cp":
|
|
@@ -1003,14 +1005,16 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
|
|
|
1003
1005
|
print(f"invalid command: {args.command}", file=sys.stderr)
|
|
1004
1006
|
return 1
|
|
1005
1007
|
return 0
|
|
1006
|
-
except BrokenPipeError:
|
|
1008
|
+
except BrokenPipeError as exc:
|
|
1007
1009
|
# Python flushes standard streams on exit; redirect remaining output
|
|
1008
1010
|
# to devnull to avoid another BrokenPipeError at shutdown
|
|
1009
1011
|
# See: https://docs.python.org/3/library/signal.html#note-on-sigpipe
|
|
1012
|
+
error = str(exc)
|
|
1010
1013
|
devnull = os.open(os.devnull, os.O_WRONLY)
|
|
1011
1014
|
os.dup2(devnull, sys.stdout.fileno())
|
|
1012
1015
|
return 141 # 128 + 13 (SIGPIPE)
|
|
1013
1016
|
except (KeyboardInterrupt, Exception) as exc:
|
|
1017
|
+
error = str(exc)
|
|
1014
1018
|
if isinstance(exc, KeyboardInterrupt):
|
|
1015
1019
|
msg = "Operation cancelled by the user"
|
|
1016
1020
|
else:
|
|
@@ -1028,3 +1032,5 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
|
|
|
1028
1032
|
|
|
1029
1033
|
pdb.post_mortem()
|
|
1030
1034
|
return 1
|
|
1035
|
+
finally:
|
|
1036
|
+
telemetry.send_cli_call(args.command, error=error)
|
|
@@ -3,7 +3,6 @@ import functools
|
|
|
3
3
|
import logging
|
|
4
4
|
import multiprocessing
|
|
5
5
|
import os
|
|
6
|
-
import posixpath
|
|
7
6
|
import re
|
|
8
7
|
import sys
|
|
9
8
|
from abc import ABC, abstractmethod
|
|
@@ -26,8 +25,8 @@ from fsspec.asyn import get_loop, sync
|
|
|
26
25
|
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
|
|
27
26
|
from tqdm import tqdm
|
|
28
27
|
|
|
29
|
-
from datachain.cache import DataChainCache
|
|
30
|
-
from datachain.client.fileslice import
|
|
28
|
+
from datachain.cache import DataChainCache
|
|
29
|
+
from datachain.client.fileslice import FileWrapper
|
|
31
30
|
from datachain.error import ClientError as DataChainClientError
|
|
32
31
|
from datachain.lib.file import File
|
|
33
32
|
from datachain.nodes_fetcher import NodesFetcher
|
|
@@ -187,8 +186,8 @@ class Client(ABC):
|
|
|
187
186
|
def url(self, path: str, expires: int = 3600, **kwargs) -> str:
|
|
188
187
|
return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
|
|
189
188
|
|
|
190
|
-
async def get_current_etag(self,
|
|
191
|
-
info = await self.fs._info(self.get_full_path(
|
|
189
|
+
async def get_current_etag(self, file: "File") -> str:
|
|
190
|
+
info = await self.fs._info(self.get_full_path(file.path))
|
|
192
191
|
return self.info_to_file(info, "").etag
|
|
193
192
|
|
|
194
193
|
async def get_size(self, path: str) -> int:
|
|
@@ -317,7 +316,7 @@ class Client(ABC):
|
|
|
317
316
|
|
|
318
317
|
def instantiate_object(
|
|
319
318
|
self,
|
|
320
|
-
|
|
319
|
+
file: "File",
|
|
321
320
|
dst: str,
|
|
322
321
|
progress_bar: tqdm,
|
|
323
322
|
force: bool = False,
|
|
@@ -328,10 +327,10 @@ class Client(ABC):
|
|
|
328
327
|
else:
|
|
329
328
|
progress_bar.close()
|
|
330
329
|
raise FileExistsError(f"Path {dst} already exists")
|
|
331
|
-
self.do_instantiate_object(
|
|
330
|
+
self.do_instantiate_object(file, dst)
|
|
332
331
|
|
|
333
|
-
def do_instantiate_object(self,
|
|
334
|
-
src = self.cache.get_path(
|
|
332
|
+
def do_instantiate_object(self, file: "File", dst: str) -> None:
|
|
333
|
+
src = self.cache.get_path(file)
|
|
335
334
|
assert src is not None
|
|
336
335
|
|
|
337
336
|
try:
|
|
@@ -341,66 +340,33 @@ class Client(ABC):
|
|
|
341
340
|
copy2(src, dst)
|
|
342
341
|
|
|
343
342
|
def open_object(
|
|
344
|
-
self,
|
|
343
|
+
self, file: File, use_cache: bool = True, cb: Callback = DEFAULT_CALLBACK
|
|
345
344
|
) -> BinaryIO:
|
|
346
345
|
"""Open a file, including files in tar archives."""
|
|
347
|
-
|
|
348
|
-
if use_cache and (cache_path := self.cache.get_path(uid)):
|
|
346
|
+
if use_cache and (cache_path := self.cache.get_path(file)):
|
|
349
347
|
return open(cache_path, mode="rb") # noqa: SIM115
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
offset = location["offset"]
|
|
359
|
-
size = location["size"]
|
|
360
|
-
parent = location["parent"]
|
|
361
|
-
|
|
362
|
-
parent_uid = UniqueId(
|
|
363
|
-
parent["source"],
|
|
364
|
-
parent["path"],
|
|
365
|
-
parent["size"],
|
|
366
|
-
parent["etag"],
|
|
367
|
-
location=parent["location"],
|
|
368
|
-
)
|
|
369
|
-
f = self.open_object(parent_uid, use_cache=use_cache)
|
|
370
|
-
return FileSlice(f, offset, size, posixpath.basename(uid.path))
|
|
371
|
-
|
|
372
|
-
def download(self, uid: UniqueId, *, callback: Callback = DEFAULT_CALLBACK) -> None:
|
|
373
|
-
sync(get_loop(), functools.partial(self._download, uid, callback=callback))
|
|
374
|
-
|
|
375
|
-
async def _download(self, uid: UniqueId, *, callback: "Callback" = None) -> None:
|
|
376
|
-
if self.cache.contains(uid):
|
|
348
|
+
assert not file.location
|
|
349
|
+
return FileWrapper(self.fs.open(self.get_full_path(file.path)), cb) # type: ignore[return-value]
|
|
350
|
+
|
|
351
|
+
def download(self, file: File, *, callback: Callback = DEFAULT_CALLBACK) -> None:
|
|
352
|
+
sync(get_loop(), functools.partial(self._download, file, callback=callback))
|
|
353
|
+
|
|
354
|
+
async def _download(self, file: File, *, callback: "Callback" = None) -> None:
|
|
355
|
+
if self.cache.contains(file):
|
|
377
356
|
# Already in cache, so there's nothing to do.
|
|
378
357
|
return
|
|
379
|
-
await self._put_in_cache(
|
|
358
|
+
await self._put_in_cache(file, callback=callback)
|
|
380
359
|
|
|
381
|
-
def put_in_cache(self,
|
|
382
|
-
sync(get_loop(), functools.partial(self._put_in_cache,
|
|
360
|
+
def put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
|
|
361
|
+
sync(get_loop(), functools.partial(self._put_in_cache, file, callback=callback))
|
|
383
362
|
|
|
384
|
-
async def _put_in_cache(
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
loop = asyncio.get_running_loop()
|
|
390
|
-
await loop.run_in_executor(
|
|
391
|
-
None, functools.partial(self._download_from_tar, uid, callback=callback)
|
|
392
|
-
)
|
|
393
|
-
return
|
|
394
|
-
if uid.etag:
|
|
395
|
-
etag = await self.get_current_etag(uid)
|
|
396
|
-
if uid.etag != etag:
|
|
363
|
+
async def _put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
|
|
364
|
+
assert not file.location
|
|
365
|
+
if file.etag:
|
|
366
|
+
etag = await self.get_current_etag(file)
|
|
367
|
+
if file.etag != etag:
|
|
397
368
|
raise FileNotFoundError(
|
|
398
|
-
f"Invalid etag for {
|
|
399
|
-
f"expected {
|
|
369
|
+
f"Invalid etag for {file.source}/{file.path}: "
|
|
370
|
+
f"expected {file.etag}, got {etag}"
|
|
400
371
|
)
|
|
401
|
-
await self.cache.download(
|
|
402
|
-
|
|
403
|
-
def _download_from_tar(self, uid, *, callback: "Callback" = None):
|
|
404
|
-
with self._open_tar(uid, use_cache=False) as f:
|
|
405
|
-
contents = f.read()
|
|
406
|
-
self.cache.store_data(uid, contents)
|
|
372
|
+
await self.cache.download(file, self, callback=callback)
|
|
@@ -7,7 +7,6 @@ from urllib.parse import urlparse
|
|
|
7
7
|
|
|
8
8
|
from fsspec.implementations.local import LocalFileSystem
|
|
9
9
|
|
|
10
|
-
from datachain.cache import UniqueId
|
|
11
10
|
from datachain.lib.file import File
|
|
12
11
|
from datachain.storage import StorageURI
|
|
13
12
|
|
|
@@ -114,8 +113,8 @@ class FileClient(Client):
|
|
|
114
113
|
use_symlinks=use_symlinks,
|
|
115
114
|
)
|
|
116
115
|
|
|
117
|
-
async def get_current_etag(self,
|
|
118
|
-
info = self.fs.info(self.get_full_path(
|
|
116
|
+
async def get_current_etag(self, file: "File") -> str:
|
|
117
|
+
info = self.fs.info(self.get_full_path(file.path))
|
|
119
118
|
return self.info_to_file(info, "").etag
|
|
120
119
|
|
|
121
120
|
async def get_size(self, path: str) -> int:
|
|
@@ -49,7 +49,8 @@ class ArrowGenerator(Generator):
|
|
|
49
49
|
|
|
50
50
|
def process(self, file: File):
|
|
51
51
|
if file._caching_enabled:
|
|
52
|
-
|
|
52
|
+
file.ensure_cached()
|
|
53
|
+
path = file.get_local_path()
|
|
53
54
|
ds = dataset(path, schema=self.input_schema, **self.kwargs)
|
|
54
55
|
elif self.nrows:
|
|
55
56
|
path = _nrows_file(file, self.nrows)
|
|
@@ -58,6 +58,7 @@ from datachain.query.dataset import (
|
|
|
58
58
|
)
|
|
59
59
|
from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
|
|
60
60
|
from datachain.sql.functions import path as pathfunc
|
|
61
|
+
from datachain.telemetry import telemetry
|
|
61
62
|
from datachain.utils import inside_notebook
|
|
62
63
|
|
|
63
64
|
if TYPE_CHECKING:
|
|
@@ -246,6 +247,9 @@ class DataChain(DatasetQuery):
|
|
|
246
247
|
**kwargs,
|
|
247
248
|
indexing_column_types=File._datachain_column_types,
|
|
248
249
|
)
|
|
250
|
+
|
|
251
|
+
telemetry.send_event_once("class", "datachain_init", **kwargs)
|
|
252
|
+
|
|
249
253
|
if settings:
|
|
250
254
|
self._settings = Settings(**settings)
|
|
251
255
|
else:
|