datachain 0.8.0__tar.gz → 0.8.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.8.0 → datachain-0.8.1}/.github/workflows/benchmarks.yml +1 -1
- {datachain-0.8.0 → datachain-0.8.1}/.github/workflows/release.yml +1 -1
- {datachain-0.8.0 → datachain-0.8.1}/.github/workflows/tests-studio.yml +1 -1
- {datachain-0.8.0 → datachain-0.8.1}/.github/workflows/tests.yml +3 -3
- {datachain-0.8.0 → datachain-0.8.1}/.pre-commit-config.yaml +1 -1
- {datachain-0.8.0/src/datachain.egg-info → datachain-0.8.1}/PKG-INFO +3 -3
- {datachain-0.8.0 → datachain-0.8.1}/docs/quick-start.md +4 -2
- {datachain-0.8.0 → datachain-0.8.1}/pyproject.toml +2 -2
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/catalog/catalog.py +3 -4
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/client/gcs.py +9 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/data_storage/warehouse.py +0 -1
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/arrow.py +82 -58
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/dc.py +12 -57
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/file.py +3 -1
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/listing.py +44 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/udf.py +0 -1
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/batch.py +32 -6
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/dataset.py +17 -17
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/dispatch.py +125 -125
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/session.py +8 -5
- datachain-0.8.1/src/datachain/query/udf.py +20 -0
- datachain-0.8.1/src/datachain/query/utils.py +42 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/utils.py +1 -1
- {datachain-0.8.0 → datachain-0.8.1/src/datachain.egg-info}/PKG-INFO +3 -3
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain.egg-info/SOURCES.txt +4 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain.egg-info/requires.txt +2 -2
- {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_catalog.py +6 -2
- datachain-0.8.1/tests/func/test_session.py +25 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_arrow.py +26 -0
- datachain-0.8.1/tests/unit/test_client_gcs.py +17 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_listing.py +29 -2
- {datachain-0.8.0 → datachain-0.8.1}/.cruft.json +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/.gitattributes +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/.github/codecov.yaml +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/.github/dependabot.yml +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/.gitignore +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/LICENSE +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/README.rst +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/docs/assets/datachain.svg +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/docs/contributing.md +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/docs/examples.md +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/docs/index.md +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/docs/overrides/main.html +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/docs/references/datachain.md +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/docs/references/datatype.md +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/docs/references/file.md +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/docs/references/index.md +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/docs/references/sql.md +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/docs/references/torch.md +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/docs/references/udf.md +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/docs/tutorials.md +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/multimodal/wds.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/mkdocs.yml +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/noxfile.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/setup.cfg +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/__main__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/asyn.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/cache.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/cli.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/cli_utils.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/client/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/client/azure.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/client/hf.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/client/local.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/client/s3.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/config.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/dataset.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/error.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/array.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/base.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/conditional.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/func.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/numeric.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/path.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/random.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/string.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/func/window.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/job.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/clip.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/diff.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/hf.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/image.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/settings.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/tar.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/text.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/utils.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/listing.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/model/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/model/bbox.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/model/pose.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/model/segment.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/node.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/progress.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/py.typed +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/metrics.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/params.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/queue.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/query/schema.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/remote/studio.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/types.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/sql/utils.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/studio.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/telemetry.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/conftest.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/data.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/examples/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/examples/test_examples.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/examples/wds_data.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/func/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_client.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_datachain.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_datasets.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_listing.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_ls.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_metrics.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_pull.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_pytorch.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_query.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/func/test_toolkit.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/scripts/feature_class.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/test_atomicity.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/test_cli_e2e.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/test_cli_studio.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/test_query_e2e.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/test_telemetry.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_asyn.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_cache.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_catalog.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_client.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_config.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_dataset.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_func.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_metastore.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_query.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_query_params.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_serializer.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_session.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_utils.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.8.0 → datachain-0.8.1}/tests/utils.py +0 -0
|
@@ -37,7 +37,7 @@ jobs:
|
|
|
37
37
|
python-version: '3.9'
|
|
38
38
|
|
|
39
39
|
- name: Setup uv
|
|
40
|
-
uses: astral-sh/setup-uv@
|
|
40
|
+
uses: astral-sh/setup-uv@v5
|
|
41
41
|
with:
|
|
42
42
|
enable-cache: true
|
|
43
43
|
cache-suffix: lint
|
|
@@ -94,7 +94,7 @@ jobs:
|
|
|
94
94
|
python-version: ${{ matrix.pyv }}
|
|
95
95
|
|
|
96
96
|
- name: Setup uv
|
|
97
|
-
uses: astral-sh/setup-uv@
|
|
97
|
+
uses: astral-sh/setup-uv@v5
|
|
98
98
|
with:
|
|
99
99
|
enable-cache: true
|
|
100
100
|
cache-suffix: tests-${{ matrix.pyv }}
|
|
@@ -157,7 +157,7 @@ jobs:
|
|
|
157
157
|
python-version: ${{ matrix.pyv }}
|
|
158
158
|
|
|
159
159
|
- name: Setup uv
|
|
160
|
-
uses: astral-sh/setup-uv@
|
|
160
|
+
uses: astral-sh/setup-uv@v5
|
|
161
161
|
with:
|
|
162
162
|
enable-cache: true
|
|
163
163
|
cache-suffix: examples-${{ matrix.pyv }}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.8.0
|
|
3
|
+
Version: 0.8.1
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -84,7 +84,7 @@ Requires-Dist: requests-mock; extra == "tests"
|
|
|
84
84
|
Requires-Dist: scipy; extra == "tests"
|
|
85
85
|
Provides-Extra: dev
|
|
86
86
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
87
|
-
Requires-Dist: mypy==1.
|
|
87
|
+
Requires-Dist: mypy==1.14.0; extra == "dev"
|
|
88
88
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
89
89
|
Requires-Dist: types-pytz; extra == "dev"
|
|
90
90
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
@@ -99,7 +99,7 @@ Requires-Dist: unstructured[pdf]; extra == "examples"
|
|
|
99
99
|
Requires-Dist: pdfplumber==0.11.4; extra == "examples"
|
|
100
100
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
101
101
|
Requires-Dist: onnx==1.16.1; extra == "examples"
|
|
102
|
-
Requires-Dist: ultralytics==8.3.
|
|
102
|
+
Requires-Dist: ultralytics==8.3.53; extra == "examples"
|
|
103
103
|
|
|
104
104
|
================
|
|
105
105
|
|logo| DataChain
|
|
@@ -59,6 +59,8 @@ Batch inference with a simple sentiment model using the
|
|
|
59
59
|
pip install transformers
|
|
60
60
|
```
|
|
61
61
|
|
|
62
|
+
Note, `transformers` works only if `torch`, `tensorflow` >= 2.0, or `flax` are installed.
|
|
63
|
+
|
|
62
64
|
The code below downloads files from the cloud, and applies a
|
|
63
65
|
user-defined function to each one of them. All files with a positive
|
|
64
66
|
sentiment detected are then copied to the local directory.
|
|
@@ -114,13 +116,14 @@ DataChain can parallelize API calls; the free Mistral tier supports up
|
|
|
114
116
|
to 4 requests at the same time.
|
|
115
117
|
|
|
116
118
|
``` py
|
|
119
|
+
import os
|
|
117
120
|
from mistralai import Mistral
|
|
118
121
|
from datachain import File, DataChain, Column
|
|
119
122
|
|
|
120
123
|
PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
|
|
121
124
|
|
|
122
125
|
def eval_dialogue(file: File) -> bool:
|
|
123
|
-
client = Mistral()
|
|
126
|
+
client = Mistral(api_key = os.environ["MISTRAL_API_KEY"])
|
|
124
127
|
response = client.chat.complete(
|
|
125
128
|
model="open-mixtral-8x22b",
|
|
126
129
|
messages=[{"role": "system", "content": PROMPT},
|
|
@@ -130,7 +133,6 @@ def eval_dialogue(file: File) -> bool:
|
|
|
130
133
|
|
|
131
134
|
chain = (
|
|
132
135
|
DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
|
|
133
|
-
.settings(parallel=4, cache=True)
|
|
134
136
|
.map(is_success=eval_dialogue)
|
|
135
137
|
.save("mistral_files")
|
|
136
138
|
)
|
|
@@ -96,7 +96,7 @@ tests = [
|
|
|
96
96
|
]
|
|
97
97
|
dev = [
|
|
98
98
|
"datachain[docs,tests]",
|
|
99
|
-
"mypy==1.
|
|
99
|
+
"mypy==1.14.0",
|
|
100
100
|
"types-python-dateutil",
|
|
101
101
|
"types-pytz",
|
|
102
102
|
"types-PyYAML",
|
|
@@ -112,7 +112,7 @@ examples = [
|
|
|
112
112
|
"pdfplumber==0.11.4",
|
|
113
113
|
"huggingface_hub[hf_transfer]",
|
|
114
114
|
"onnx==1.16.1",
|
|
115
|
-
"ultralytics==8.3.
|
|
115
|
+
"ultralytics==8.3.53"
|
|
116
116
|
]
|
|
117
117
|
|
|
118
118
|
[project.urls]
|
|
@@ -52,6 +52,7 @@ from datachain.error import (
|
|
|
52
52
|
QueryScriptCancelError,
|
|
53
53
|
QueryScriptRunError,
|
|
54
54
|
)
|
|
55
|
+
from datachain.lib.listing import get_listing
|
|
55
56
|
from datachain.node import DirType, Node, NodeWithPath
|
|
56
57
|
from datachain.nodes_thread_pool import NodesThreadPool
|
|
57
58
|
from datachain.remote.studio import StudioClient
|
|
@@ -599,7 +600,7 @@ class Catalog:
|
|
|
599
600
|
source, session=self.session, update=update, object_name=object_name
|
|
600
601
|
)
|
|
601
602
|
|
|
602
|
-
list_ds_name, list_uri, list_path, _ =
|
|
603
|
+
list_ds_name, list_uri, list_path, _ = get_listing(
|
|
603
604
|
source, self.session, update=update
|
|
604
605
|
)
|
|
605
606
|
|
|
@@ -697,11 +698,9 @@ class Catalog:
|
|
|
697
698
|
)
|
|
698
699
|
indexed_sources = []
|
|
699
700
|
for source in dataset_sources:
|
|
700
|
-
from datachain.lib.dc import DataChain
|
|
701
|
-
|
|
702
701
|
client = self.get_client(source, **client_config)
|
|
703
702
|
uri = client.uri
|
|
704
|
-
dataset_name, _, _, _ =
|
|
703
|
+
dataset_name, _, _, _ = get_listing(uri, self.session)
|
|
705
704
|
listing = Listing(
|
|
706
705
|
self.metastore.clone(),
|
|
707
706
|
self.warehouse.clone(),
|
|
@@ -32,6 +32,15 @@ class GCSClient(Client):
|
|
|
32
32
|
|
|
33
33
|
return cast(GCSFileSystem, super().create_fs(**kwargs))
|
|
34
34
|
|
|
35
|
+
def url(self, path: str, expires: int = 3600, **kwargs) -> str:
|
|
36
|
+
try:
|
|
37
|
+
return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
|
|
38
|
+
except AttributeError as exc:
|
|
39
|
+
is_anon = self.fs.storage_options.get("token") == "anon"
|
|
40
|
+
if is_anon and "you need a private key to sign credentials" in str(exc):
|
|
41
|
+
return f"https://storage.googleapis.com/{self.name}/{path}"
|
|
42
|
+
raise
|
|
43
|
+
|
|
35
44
|
@staticmethod
|
|
36
45
|
def parse_timestamp(timestamp: str) -> datetime:
|
|
37
46
|
"""
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
from collections.abc import Sequence
|
|
2
|
-
from tempfile import NamedTemporaryFile
|
|
2
|
+
from itertools import islice
|
|
3
3
|
from typing import TYPE_CHECKING, Any, Optional
|
|
4
4
|
|
|
5
|
+
import fsspec.implementations.reference
|
|
5
6
|
import orjson
|
|
6
7
|
import pyarrow as pa
|
|
8
|
+
from fsspec.core import split_protocol
|
|
7
9
|
from pyarrow.dataset import CsvFileFormat, dataset
|
|
8
10
|
from tqdm import tqdm
|
|
9
11
|
|
|
@@ -25,7 +27,18 @@ if TYPE_CHECKING:
|
|
|
25
27
|
DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY = b"DataChain SignalSchema"
|
|
26
28
|
|
|
27
29
|
|
|
30
|
+
class ReferenceFileSystem(fsspec.implementations.reference.ReferenceFileSystem):
|
|
31
|
+
def _open(self, path, mode="rb", *args, **kwargs):
|
|
32
|
+
# overriding because `fsspec`'s `ReferenceFileSystem._open`
|
|
33
|
+
# reads the whole file in-memory.
|
|
34
|
+
(uri,) = self.references[path]
|
|
35
|
+
protocol, _ = split_protocol(uri)
|
|
36
|
+
return self.fss[protocol]._open(uri, mode, *args, **kwargs)
|
|
37
|
+
|
|
38
|
+
|
|
28
39
|
class ArrowGenerator(Generator):
|
|
40
|
+
DEFAULT_BATCH_SIZE = 2**17 # same as `pyarrow._dataset._DEFAULT_BATCH_SIZE`
|
|
41
|
+
|
|
29
42
|
def __init__(
|
|
30
43
|
self,
|
|
31
44
|
input_schema: Optional["pa.Schema"] = None,
|
|
@@ -55,57 +68,80 @@ class ArrowGenerator(Generator):
|
|
|
55
68
|
def process(self, file: File):
|
|
56
69
|
if file._caching_enabled:
|
|
57
70
|
file.ensure_cached()
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
path = _nrows_file(file, self.nrows)
|
|
62
|
-
ds = dataset(path, schema=self.input_schema, **self.kwargs)
|
|
71
|
+
cache_path = file.get_local_path()
|
|
72
|
+
fs_path = file.path
|
|
73
|
+
fs = ReferenceFileSystem({fs_path: [cache_path]})
|
|
63
74
|
else:
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
75
|
+
fs, fs_path = file.get_fs(), file.get_path()
|
|
76
|
+
|
|
77
|
+
ds = dataset(fs_path, schema=self.input_schema, filesystem=fs, **self.kwargs)
|
|
78
|
+
|
|
68
79
|
hf_schema = _get_hf_schema(ds.schema)
|
|
69
80
|
use_datachain_schema = (
|
|
70
81
|
bool(ds.schema.metadata)
|
|
71
82
|
and DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY in ds.schema.metadata
|
|
72
83
|
)
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
84
|
+
|
|
85
|
+
kw = {}
|
|
86
|
+
if self.nrows:
|
|
87
|
+
kw = {"batch_size": min(self.DEFAULT_BATCH_SIZE, self.nrows)}
|
|
88
|
+
|
|
89
|
+
def iter_records():
|
|
90
|
+
for record_batch in ds.to_batches(**kw):
|
|
91
|
+
yield from record_batch.to_pylist()
|
|
92
|
+
|
|
93
|
+
it = islice(iter_records(), self.nrows)
|
|
94
|
+
with tqdm(it, desc="Parsed by pyarrow", unit="rows", total=self.nrows) as pbar:
|
|
95
|
+
for index, record in enumerate(pbar):
|
|
96
|
+
yield self._process_record(
|
|
97
|
+
record, file, index, hf_schema, use_datachain_schema
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
def _process_record(
|
|
101
|
+
self,
|
|
102
|
+
record: dict[str, Any],
|
|
103
|
+
file: File,
|
|
104
|
+
index: int,
|
|
105
|
+
hf_schema: Optional[tuple["Features", dict[str, "DataType"]]],
|
|
106
|
+
use_datachain_schema: bool,
|
|
107
|
+
):
|
|
108
|
+
if use_datachain_schema and self.output_schema:
|
|
109
|
+
vals = [_nested_model_instantiate(record, self.output_schema)]
|
|
110
|
+
else:
|
|
111
|
+
vals = self._process_non_datachain_record(record, hf_schema)
|
|
112
|
+
|
|
113
|
+
if self.source:
|
|
114
|
+
kwargs: dict = self.kwargs
|
|
115
|
+
# Can't serialize CsvFileFormat; may lose formatting options.
|
|
116
|
+
if isinstance(kwargs.get("format"), CsvFileFormat):
|
|
117
|
+
kwargs["format"] = "csv"
|
|
118
|
+
arrow_file = ArrowRow(file=file, index=index, kwargs=kwargs)
|
|
119
|
+
return [arrow_file, *vals]
|
|
120
|
+
return vals
|
|
121
|
+
|
|
122
|
+
def _process_non_datachain_record(
|
|
123
|
+
self,
|
|
124
|
+
record: dict[str, Any],
|
|
125
|
+
hf_schema: Optional[tuple["Features", dict[str, "DataType"]]],
|
|
126
|
+
):
|
|
127
|
+
vals = list(record.values())
|
|
128
|
+
if not self.output_schema:
|
|
129
|
+
return vals
|
|
130
|
+
|
|
131
|
+
fields = self.output_schema.model_fields
|
|
132
|
+
vals_dict = {}
|
|
133
|
+
for i, ((field, field_info), val) in enumerate(zip(fields.items(), vals)):
|
|
134
|
+
anno = field_info.annotation
|
|
135
|
+
if hf_schema:
|
|
136
|
+
from datachain.lib.hf import convert_feature
|
|
137
|
+
|
|
138
|
+
feat = list(hf_schema[0].values())[i]
|
|
139
|
+
vals_dict[field] = convert_feature(val, feat, anno)
|
|
140
|
+
elif ModelStore.is_pydantic(anno):
|
|
141
|
+
vals_dict[field] = anno(**val) # type: ignore[misc]
|
|
142
|
+
else:
|
|
143
|
+
vals_dict[field] = val
|
|
144
|
+
return [self.output_schema(**vals_dict)]
|
|
109
145
|
|
|
110
146
|
|
|
111
147
|
def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
|
|
@@ -190,18 +226,6 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa:
|
|
|
190
226
|
raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")
|
|
191
227
|
|
|
192
228
|
|
|
193
|
-
def _nrows_file(file: File, nrows: int) -> str:
|
|
194
|
-
tf = NamedTemporaryFile(delete=False) # noqa: SIM115
|
|
195
|
-
with file.open(mode="r") as reader:
|
|
196
|
-
with open(tf.name, "a") as writer:
|
|
197
|
-
for row, line in enumerate(reader):
|
|
198
|
-
if row >= nrows:
|
|
199
|
-
break
|
|
200
|
-
writer.write(line)
|
|
201
|
-
writer.write("\n")
|
|
202
|
-
return tf.name
|
|
203
|
-
|
|
204
|
-
|
|
205
229
|
def _get_hf_schema(
|
|
206
230
|
schema: "pa.Schema",
|
|
207
231
|
) -> Optional[tuple["Features", dict[str, "DataType"]]]:
|
|
@@ -11,7 +11,6 @@ from typing import (
|
|
|
11
11
|
BinaryIO,
|
|
12
12
|
Callable,
|
|
13
13
|
ClassVar,
|
|
14
|
-
Literal,
|
|
15
14
|
Optional,
|
|
16
15
|
TypeVar,
|
|
17
16
|
Union,
|
|
@@ -24,8 +23,6 @@ from pydantic import BaseModel
|
|
|
24
23
|
from sqlalchemy.sql.functions import GenericFunction
|
|
25
24
|
from sqlalchemy.sql.sqltypes import NullType
|
|
26
25
|
|
|
27
|
-
from datachain.client import Client
|
|
28
|
-
from datachain.client.local import FileClient
|
|
29
26
|
from datachain.dataset import DatasetRecord
|
|
30
27
|
from datachain.func.base import Function
|
|
31
28
|
from datachain.func.func import Func
|
|
@@ -33,13 +30,9 @@ from datachain.lib.convert.python_to_sql import python_to_sql
|
|
|
33
30
|
from datachain.lib.convert.values_to_tuples import values_to_tuples
|
|
34
31
|
from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
|
|
35
32
|
from datachain.lib.dataset_info import DatasetInfo
|
|
36
|
-
from datachain.lib.file import ArrowRow, File, get_file_type
|
|
33
|
+
from datachain.lib.file import ArrowRow, File, FileType, get_file_type
|
|
37
34
|
from datachain.lib.file import ExportPlacement as FileExportPlacement
|
|
38
|
-
from datachain.lib.listing import (
|
|
39
|
-
list_bucket,
|
|
40
|
-
ls,
|
|
41
|
-
parse_listing_uri,
|
|
42
|
-
)
|
|
35
|
+
from datachain.lib.listing import get_listing, list_bucket, ls
|
|
43
36
|
from datachain.lib.listing_info import ListingInfo
|
|
44
37
|
from datachain.lib.meta_formats import read_meta
|
|
45
38
|
from datachain.lib.model_store import ModelStore
|
|
@@ -403,53 +396,12 @@ class DataChain:
|
|
|
403
396
|
self.signals_schema |= signals_schema
|
|
404
397
|
return self
|
|
405
398
|
|
|
406
|
-
@classmethod
|
|
407
|
-
def parse_uri(
|
|
408
|
-
cls, uri: str, session: Session, update: bool = False
|
|
409
|
-
) -> tuple[str, str, str, bool]:
|
|
410
|
-
"""Returns correct listing dataset name that must be used for saving listing
|
|
411
|
-
operation. It takes into account existing listings and reusability of those.
|
|
412
|
-
It also returns boolean saying if returned dataset name is reused / already
|
|
413
|
-
exists or not, and it returns correct listing path that should be used to find
|
|
414
|
-
rows based on uri.
|
|
415
|
-
"""
|
|
416
|
-
catalog = session.catalog
|
|
417
|
-
cache = catalog.cache
|
|
418
|
-
client_config = catalog.client_config
|
|
419
|
-
|
|
420
|
-
client = Client.get_client(uri, cache, **client_config)
|
|
421
|
-
ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
|
|
422
|
-
listing = None
|
|
423
|
-
|
|
424
|
-
listings = [
|
|
425
|
-
ls
|
|
426
|
-
for ls in catalog.listings()
|
|
427
|
-
if not ls.is_expired and ls.contains(ds_name)
|
|
428
|
-
]
|
|
429
|
-
|
|
430
|
-
if listings:
|
|
431
|
-
if update:
|
|
432
|
-
# choosing the smallest possible one to minimize update time
|
|
433
|
-
listing = sorted(listings, key=lambda ls: len(ls.name))[0]
|
|
434
|
-
else:
|
|
435
|
-
# no need to update, choosing the most recent one
|
|
436
|
-
listing = sorted(listings, key=lambda ls: ls.created_at)[-1]
|
|
437
|
-
|
|
438
|
-
if isinstance(client, FileClient) and listing and listing.name != ds_name:
|
|
439
|
-
# For local file system we need to fix listing path / prefix
|
|
440
|
-
# if we are reusing existing listing
|
|
441
|
-
list_path = f'{ds_name.strip("/").removeprefix(listing.name)}/{list_path}'
|
|
442
|
-
|
|
443
|
-
ds_name = listing.name if listing else ds_name
|
|
444
|
-
|
|
445
|
-
return ds_name, list_uri, list_path, bool(listing)
|
|
446
|
-
|
|
447
399
|
@classmethod
|
|
448
400
|
def from_storage(
|
|
449
401
|
cls,
|
|
450
402
|
uri,
|
|
451
403
|
*,
|
|
452
|
-
type: Literal["binary", "text", "image"] = "binary",
|
|
404
|
+
type: FileType = "binary",
|
|
453
405
|
session: Optional[Session] = None,
|
|
454
406
|
settings: Optional[dict] = None,
|
|
455
407
|
in_memory: bool = False,
|
|
@@ -482,7 +434,7 @@ class DataChain:
|
|
|
482
434
|
cache = session.catalog.cache
|
|
483
435
|
client_config = session.catalog.client_config
|
|
484
436
|
|
|
485
|
-
list_ds_name, list_uri, list_path, list_ds_exists = cls.parse_uri(
|
|
437
|
+
list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
|
|
486
438
|
uri, session, update=update
|
|
487
439
|
)
|
|
488
440
|
|
|
@@ -548,7 +500,7 @@ class DataChain:
|
|
|
548
500
|
def from_json(
|
|
549
501
|
cls,
|
|
550
502
|
path,
|
|
551
|
-
type: Literal["binary", "text", "image"] = "text",
|
|
503
|
+
type: FileType = "text",
|
|
552
504
|
spec: Optional[DataType] = None,
|
|
553
505
|
schema_from: Optional[str] = "auto",
|
|
554
506
|
jmespath: Optional[str] = None,
|
|
@@ -605,7 +557,9 @@ class DataChain:
|
|
|
605
557
|
nrows=nrows,
|
|
606
558
|
)
|
|
607
559
|
}
|
|
608
|
-
|
|
560
|
+
# disable prefetch if nrows is set
|
|
561
|
+
settings = {"prefetch": 0} if nrows else {}
|
|
562
|
+
return chain.settings(**settings).gen(**signal_dict) # type: ignore[misc, arg-type]
|
|
609
563
|
|
|
610
564
|
def explode(
|
|
611
565
|
self,
|
|
@@ -1942,7 +1896,10 @@ class DataChain:
|
|
|
1942
1896
|
|
|
1943
1897
|
if source:
|
|
1944
1898
|
output = {"source": ArrowRow} | output # type: ignore[assignment,operator]
|
|
1945
|
-
|
|
1899
|
+
|
|
1900
|
+
# disable prefetch if nrows is set
|
|
1901
|
+
settings = {"prefetch": 0} if nrows else {}
|
|
1902
|
+
return self.settings(**settings).gen( # type: ignore[arg-type]
|
|
1946
1903
|
ArrowGenerator(schema, model, source, nrows, **kwargs), output=output
|
|
1947
1904
|
)
|
|
1948
1905
|
|
|
@@ -2024,8 +1981,6 @@ class DataChain:
|
|
|
2024
1981
|
else:
|
|
2025
1982
|
msg = f"error parsing csv - incompatible output type {type(output)}"
|
|
2026
1983
|
raise DatasetPrepareError(chain.name, msg)
|
|
2027
|
-
elif nrows:
|
|
2028
|
-
nrows += 1
|
|
2029
1984
|
|
|
2030
1985
|
parse_options = ParseOptions(delimiter=delimiter)
|
|
2031
1986
|
read_options = ReadOptions(column_names=column_names)
|
|
@@ -39,6 +39,8 @@ logger = logging.getLogger("datachain")
|
|
|
39
39
|
# how to create file path when exporting
|
|
40
40
|
ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
|
|
41
41
|
|
|
42
|
+
FileType = Literal["binary", "text", "image"]
|
|
43
|
+
|
|
42
44
|
|
|
43
45
|
class VFileError(DataChainError):
|
|
44
46
|
def __init__(self, file: "File", message: str, vtype: str = ""):
|
|
@@ -470,7 +472,7 @@ class ArrowRow(DataModel):
|
|
|
470
472
|
return record_batch.to_pylist()[0]
|
|
471
473
|
|
|
472
474
|
|
|
473
|
-
def get_file_type(type_: Literal["binary", "text", "image"] = "binary") -> type[File]:
|
|
475
|
+
def get_file_type(type_: FileType = "binary") -> type[File]:
|
|
474
476
|
file: type[File] = File
|
|
475
477
|
if type_ == "text":
|
|
476
478
|
file = TextFile
|
|
@@ -15,6 +15,7 @@ from datachain.utils import uses_glob
|
|
|
15
15
|
|
|
16
16
|
if TYPE_CHECKING:
|
|
17
17
|
from datachain.lib.dc import DataChain
|
|
18
|
+
from datachain.query.session import Session
|
|
18
19
|
|
|
19
20
|
LISTING_TTL = 4 * 60 * 60 # cached listing lasts 4 hours
|
|
20
21
|
LISTING_PREFIX = "lst__" # listing datasets start with this name
|
|
@@ -108,3 +109,46 @@ def listing_uri_from_name(dataset_name: str) -> str:
|
|
|
108
109
|
if not is_listing_dataset(dataset_name):
|
|
109
110
|
raise ValueError(f"Dataset {dataset_name} is not a listing")
|
|
110
111
|
return dataset_name.removeprefix(LISTING_PREFIX)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def get_listing(
|
|
115
|
+
uri: str, session: "Session", update: bool = False
|
|
116
|
+
) -> tuple[str, str, str, bool]:
|
|
117
|
+
"""Returns correct listing dataset name that must be used for saving listing
|
|
118
|
+
operation. It takes into account existing listings and reusability of those.
|
|
119
|
+
It also returns boolean saying if returned dataset name is reused / already
|
|
120
|
+
exists or not (on update it always returns False - just because there was no
|
|
121
|
+
reason to complicate it so far). And it returns correct listing path that should
|
|
122
|
+
be used to find rows based on uri.
|
|
123
|
+
"""
|
|
124
|
+
from datachain.client.local import FileClient
|
|
125
|
+
|
|
126
|
+
catalog = session.catalog
|
|
127
|
+
cache = catalog.cache
|
|
128
|
+
client_config = catalog.client_config
|
|
129
|
+
|
|
130
|
+
client = Client.get_client(uri, cache, **client_config)
|
|
131
|
+
ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
|
|
132
|
+
listing = None
|
|
133
|
+
|
|
134
|
+
listings = [
|
|
135
|
+
ls for ls in catalog.listings() if not ls.is_expired and ls.contains(ds_name)
|
|
136
|
+
]
|
|
137
|
+
|
|
138
|
+
# if no need to update - choosing the most recent one;
|
|
139
|
+
# otherwise, we'll be using the exact original `ds_name` in this case:
|
|
140
|
+
# - if a "bigger" listing exists, we don't want to update it, it's better
|
|
141
|
+
# to create a new "smaller" one on "update=True"
|
|
142
|
+
# - if an exact listing exists it will have the same name as `ds_name`
|
|
143
|
+
# anyway below
|
|
144
|
+
if listings and not update:
|
|
145
|
+
listing = sorted(listings, key=lambda ls: ls.created_at)[-1]
|
|
146
|
+
|
|
147
|
+
# for local file system we need to fix listing path / prefix
|
|
148
|
+
# if we are reusing existing listing
|
|
149
|
+
if isinstance(client, FileClient) and listing and listing.name != ds_name:
|
|
150
|
+
list_path = f'{ds_name.strip("/").removeprefix(listing.name)}/{list_path}'
|
|
151
|
+
|
|
152
|
+
ds_name = listing.name if listing else ds_name
|
|
153
|
+
|
|
154
|
+
return ds_name, list_uri, list_path, bool(listing)
|