datachain 0.8.0__tar.gz → 0.8.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- {datachain-0.8.0 → datachain-0.8.2}/.github/workflows/benchmarks.yml +1 -1
- {datachain-0.8.0 → datachain-0.8.2}/.github/workflows/release.yml +1 -1
- {datachain-0.8.0 → datachain-0.8.2}/.github/workflows/tests-studio.yml +1 -1
- {datachain-0.8.0 → datachain-0.8.2}/.github/workflows/tests.yml +3 -3
- {datachain-0.8.0 → datachain-0.8.2}/.pre-commit-config.yaml +1 -1
- {datachain-0.8.0/src/datachain.egg-info → datachain-0.8.2}/PKG-INFO +85 -3
- {datachain-0.8.0 → datachain-0.8.2}/README.rst +82 -0
- {datachain-0.8.0 → datachain-0.8.2}/docs/quick-start.md +10 -8
- {datachain-0.8.0 → datachain-0.8.2}/pyproject.toml +2 -2
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/catalog/catalog.py +3 -4
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/client/gcs.py +10 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/data_storage/warehouse.py +0 -1
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/arrow.py +82 -58
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/dc.py +12 -57
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/file.py +3 -1
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/listing.py +44 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/udf.py +0 -1
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/batch.py +32 -6
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/dataset.py +17 -17
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/dispatch.py +125 -125
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/session.py +8 -5
- datachain-0.8.2/src/datachain/query/udf.py +20 -0
- datachain-0.8.2/src/datachain/query/utils.py +42 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/utils.py +1 -1
- {datachain-0.8.0 → datachain-0.8.2/src/datachain.egg-info}/PKG-INFO +85 -3
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain.egg-info/SOURCES.txt +4 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain.egg-info/requires.txt +2 -2
- {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_catalog.py +6 -2
- datachain-0.8.2/tests/func/test_session.py +25 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_arrow.py +26 -0
- datachain-0.8.2/tests/unit/test_client_gcs.py +6 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_listing.py +29 -2
- {datachain-0.8.0 → datachain-0.8.2}/.cruft.json +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/.gitattributes +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/.github/codecov.yaml +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/.github/dependabot.yml +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/.gitignore +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/LICENSE +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/docs/assets/datachain.svg +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/docs/contributing.md +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/docs/css/github-permalink-style.css +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/docs/examples.md +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/docs/index.md +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/docs/overrides/main.html +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/docs/references/datachain.md +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/docs/references/datatype.md +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/docs/references/file.md +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/docs/references/index.md +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/docs/references/sql.md +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/docs/references/torch.md +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/docs/references/udf.md +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/docs/tutorials.md +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/get_started/torch-loader.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/llm_and_nlp/unstructured-embeddings-gen.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/multimodal/wds.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/mkdocs.yml +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/noxfile.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/setup.cfg +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/__main__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/asyn.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/cache.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/cli.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/cli_utils.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/client/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/client/azure.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/client/hf.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/client/local.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/client/s3.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/config.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/dataset.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/error.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/array.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/base.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/conditional.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/func.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/numeric.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/path.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/random.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/string.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/func/window.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/job.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/clip.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/diff.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/hf.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/image.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/meta_formats.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/pytorch.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/settings.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/signal_schema.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/tar.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/text.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/utils.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/listing.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/model/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/model/bbox.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/model/pose.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/model/segment.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/node.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/progress.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/py.typed +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/metrics.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/params.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/queue.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/query/schema.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/remote/studio.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/types.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/sql/utils.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/studio.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/telemetry.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/toolkit/split.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/conftest.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/data.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/examples/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/examples/test_examples.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/examples/wds_data.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/func/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_client.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_datachain.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_datasets.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_listing.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_ls.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_metrics.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_pull.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_pytorch.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_query.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/func/test_toolkit.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/scripts/feature_class.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/test_atomicity.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/test_cli_e2e.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/test_cli_studio.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/test_query_e2e.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/test_telemetry.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_diff.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_signal_schema.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_asyn.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_cache.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_catalog.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_client.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_config.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_dataset.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_func.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_metastore.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_query.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_query_params.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_serializer.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_session.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_utils.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.8.0 → datachain-0.8.2}/tests/utils.py +0 -0
.github/workflows/tests.yml

@@ -37,7 +37,7 @@ jobs:
       python-version: '3.9'

     - name: Setup uv
-      uses: astral-sh/setup-uv@
+      uses: astral-sh/setup-uv@v5
       with:
         enable-cache: true
         cache-suffix: lint
@@ -94,7 +94,7 @@ jobs:
       python-version: ${{ matrix.pyv }}

     - name: Setup uv
-      uses: astral-sh/setup-uv@
+      uses: astral-sh/setup-uv@v5
       with:
         enable-cache: true
         cache-suffix: tests-${{ matrix.pyv }}
@@ -157,7 +157,7 @@ jobs:
       python-version: ${{ matrix.pyv }}

     - name: Setup uv
-      uses: astral-sh/setup-uv@
+      uses: astral-sh/setup-uv@v5
       with:
         enable-cache: true
         cache-suffix: examples-${{ matrix.pyv }}
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.8.0
+Version: 0.8.2
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -84,7 +84,7 @@ Requires-Dist: requests-mock; extra == "tests"
 Requires-Dist: scipy; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.
+Requires-Dist: mypy==1.14.0; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"
@@ -99,7 +99,7 @@ Requires-Dist: unstructured[pdf]; extra == "examples"
 Requires-Dist: pdfplumber==0.11.4; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: onnx==1.16.1; extra == "examples"
-Requires-Dist: ultralytics==8.3.
+Requires-Dist: ultralytics==8.3.53; extra == "examples"
 
 ================
 |logo| DataChain
@@ -145,6 +145,88 @@ Getting Started
 Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
 to get started with `DataChain` and learn more.
 
+.. code:: bash
+
+    pip install datachain
+
+
+Example: download subset of files based on metadata
+---------------------------------------------------
+
+Sometimes users only need to download a specific subset of files from cloud storage,
+rather than the entire dataset.
+For example, you could use a JSON file's metadata to download just cat images with
+high confidence scores.
+
+
+.. code:: py
+
+    from datachain import Column, DataChain
+
+    meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
+    images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
+
+    images_id = images.map(id=lambda file: file.path.split('.')[-2])
+    annotated = images_id.merge(meta, on="id", right_on="meta.id")
+
+    likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
+                                   & (Column("meta.inference.class_") == "cat"))
+    likely_cats.export_files("high-confidence-cats/", signal="file")
+
+
+Example: LLM based text-file evaluation
+---------------------------------------
+
+In this example, we evaluate chatbot conversations stored in text files
+using LLM based evaluation.
+
+.. code:: shell
+
+    $ pip install mistralai # Requires version >=1.0.0
+    $ export MISTRAL_API_KEY=_your_key_
+
+Python code:
+
+.. code:: py
+
+    from mistralai import Mistral
+    from datachain import File, DataChain, Column
+
+    PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
+
+    def eval_dialogue(file: File) -> bool:
+         client = Mistral()
+         response = client.chat.complete(
+             model="open-mixtral-8x22b",
+             messages=[{"role": "system", "content": PROMPT},
+                       {"role": "user", "content": file.read()}])
+         result = response.choices[0].message.content
+         return result.lower().startswith("success")
+
+    chain = (
+        DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
+        .settings(parallel=4, cache=True)
+        .map(is_success=eval_dialogue)
+        .save("mistral_files")
+    )
+
+    successful_chain = chain.filter(Column("is_success") == True)
+    successful_chain.export_files("./output_mistral")
+
+    print(f"{successful_chain.count()} files were exported")
+
+
+
+With the instruction above, the Mistral model considers 31/50 files to hold the successful dialogues:
+
+.. code:: shell
+
+    $ ls output_mistral/datachain-demo/chatbot-KiT/
+    1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ...
+    $ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l
+    31
+
+
 Key Features
 ============
 
README.rst

@@ -42,6 +42,88 @@ Getting Started
 Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
 to get started with `DataChain` and learn more.
 
+.. code:: bash
+
+    pip install datachain
+
+
+Example: download subset of files based on metadata
+---------------------------------------------------
+
+Sometimes users only need to download a specific subset of files from cloud storage,
+rather than the entire dataset.
+For example, you could use a JSON file's metadata to download just cat images with
+high confidence scores.
+
+
+.. code:: py
+
+    from datachain import Column, DataChain
+
+    meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
+    images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
+
+    images_id = images.map(id=lambda file: file.path.split('.')[-2])
+    annotated = images_id.merge(meta, on="id", right_on="meta.id")
+
+    likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
+                                   & (Column("meta.inference.class_") == "cat"))
+    likely_cats.export_files("high-confidence-cats/", signal="file")
+
+
+Example: LLM based text-file evaluation
+---------------------------------------
+
+In this example, we evaluate chatbot conversations stored in text files
+using LLM based evaluation.
+
+.. code:: shell
+
+    $ pip install mistralai # Requires version >=1.0.0
+    $ export MISTRAL_API_KEY=_your_key_
+
+Python code:
+
+.. code:: py
+
+    from mistralai import Mistral
+    from datachain import File, DataChain, Column
+
+    PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
+
+    def eval_dialogue(file: File) -> bool:
+         client = Mistral()
+         response = client.chat.complete(
+             model="open-mixtral-8x22b",
+             messages=[{"role": "system", "content": PROMPT},
+                       {"role": "user", "content": file.read()}])
+         result = response.choices[0].message.content
+         return result.lower().startswith("success")
+
+    chain = (
+        DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
+        .settings(parallel=4, cache=True)
+        .map(is_success=eval_dialogue)
+        .save("mistral_files")
+    )
+
+    successful_chain = chain.filter(Column("is_success") == True)
+    successful_chain.export_files("./output_mistral")
+
+    print(f"{successful_chain.count()} files were exported")
+
+
+
+With the instruction above, the Mistral model considers 31/50 files to hold the successful dialogues:
+
+.. code:: shell
+
+    $ ls output_mistral/datachain-demo/chatbot-KiT/
+    1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ...
+    $ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l
+    31
+
+
 Key Features
 ============
 
docs/quick-start.md

@@ -39,8 +39,8 @@ using JSON metadata:
 ``` py
 from datachain import Column, DataChain
 
-meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta")
-images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg")
+meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
+images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
 
 images_id = images.map(id=lambda file: file.path.split('.')[-2])
 annotated = images_id.merge(meta, on="id", right_on="meta.id")
@@ -59,6 +59,8 @@ Batch inference with a simple sentiment model using the
 pip install transformers
 ```
 
+Note, `transformers` works only if `torch`, `tensorflow` >= 2.0, or `flax` are installed.
+
 The code below downloads files from the cloud, and applies a
 user-defined function to each one of them. All files with a positive
 sentiment detected are then copied to the local directory.
@@ -76,7 +78,7 @@ def is_positive_dialogue_ending(file) -> bool:
 
 chain = (
    DataChain.from_storage("gs://datachain-demo/chatbot-KiT/",
-                          object_name="file", type="text")
+                          object_name="file", type="text", anon=True)
    .settings(parallel=8, cache=True)
    .map(is_positive=is_positive_dialogue_ending)
    .save("file_response")
@@ -114,13 +116,14 @@ DataChain can parallelize API calls; the free Mistral tier supports up
 to 4 requests at the same time.
 
 ``` py
+import os
 from mistralai import Mistral
 from datachain import File, DataChain, Column
 
 PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
 
 def eval_dialogue(file: File) -> bool:
-     client = Mistral()
+     client = Mistral(api_key = os.environ["MISTRAL_API_KEY"])
      response = client.chat.complete(
          model="open-mixtral-8x22b",
          messages=[{"role": "system", "content": PROMPT},
@@ -129,8 +132,7 @@ def eval_dialogue(file: File) -> bool:
      return result.lower().startswith("success")
 
 chain = (
-    DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
-    .settings(parallel=4, cache=True)
+    DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
     .map(is_success=eval_dialogue)
     .save("mistral_files")
 )
@@ -175,7 +177,7 @@ def eval_dialog(file: File) -> ChatCompletionResponse:
               {"role": "user", "content": file.read()}])
 
 chain = (
-    DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
+    DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
     .settings(parallel=4, cache=True)
     .map(response=eval_dialog)
     .map(status=lambda response: response.choices[0].message.content.lower()[:7])
@@ -271,7 +273,7 @@ from datachain import C, DataChain
 processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
 chain = (
-    DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image")
+    DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image", anon=True)
     .map(label=lambda name: name.split(".")[0], params=["file.name"])
     .select("file", "label").to_pytorch(
         transform=processor.image_processor,
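The recurring change in this file is `anon=True` on every call that reads the public demo bucket. For reference, this is a small standalone sketch of what anonymous access to that bucket looks like at the gcsfs layer, independent of DataChain (the listed prefix is taken from the examples above):

``` py
# Standalone sketch: unauthenticated access to the public demo bucket used in
# the docs, via gcsfs directly rather than through DataChain.
import gcsfs

fs = gcsfs.GCSFileSystem(token="anon")  # no credentials required for public data
print(fs.ls("datachain-demo/dogs-and-cats")[:3])  # a few objects from the bucket
```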
pyproject.toml

@@ -96,7 +96,7 @@ tests = [
 ]
 dev = [
   "datachain[docs,tests]",
-  "mypy==1.
+  "mypy==1.14.0",
   "types-python-dateutil",
   "types-pytz",
   "types-PyYAML",
@@ -112,7 +112,7 @@ examples = [
   "pdfplumber==0.11.4",
   "huggingface_hub[hf_transfer]",
   "onnx==1.16.1",
-  "ultralytics==8.3.
+  "ultralytics==8.3.53"
 ]
 
 [project.urls]
src/datachain/catalog/catalog.py

@@ -52,6 +52,7 @@ from datachain.error import (
     QueryScriptCancelError,
     QueryScriptRunError,
 )
+from datachain.lib.listing import get_listing
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
@@ -599,7 +600,7 @@ class Catalog:
             source, session=self.session, update=update, object_name=object_name
         )
 
-        list_ds_name, list_uri, list_path, _ =
+        list_ds_name, list_uri, list_path, _ = get_listing(
             source, self.session, update=update
         )
 
@@ -697,11 +698,9 @@ class Catalog:
         )
         indexed_sources = []
         for source in dataset_sources:
-            from datachain.lib.dc import DataChain
-
             client = self.get_client(source, **client_config)
             uri = client.uri
-            dataset_name, _, _, _ =
+            dataset_name, _, _, _ = get_listing(uri, self.session)
             listing = Listing(
                 self.metastore.clone(),
                 self.warehouse.clone(),
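Both call sites above use the new `get_listing` helper the same way: pass a storage URI and the current session, and unpack the listing dataset name, URI, and path from the returned 4-tuple. A hedged sketch of that usage, based only on what is visible in this diff (how the session is obtained is an assumption, not shown here):

``` py
# Hedged sketch based only on the call sites in this diff; Session.get() as a
# way to obtain the default session is an assumption, and the fourth element
# of the returned tuple is not used at these call sites.
from datachain.lib.listing import get_listing
from datachain.query.session import Session

session = Session.get()  # hypothetical: obtain the default session
list_ds_name, list_uri, list_path, _ = get_listing(
    "gs://datachain-demo/dogs-and-cats/", session
)
print(list_ds_name, list_uri, list_path)
```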
src/datachain/client/gcs.py

@@ -32,6 +32,16 @@ class GCSClient(Client):
 
         return cast(GCSFileSystem, super().create_fs(**kwargs))
 
+    def url(self, path: str, expires: int = 3600, **kwargs) -> str:
+        """
+        Generate a signed URL for the given path.
+        If the client is anonymous, a public URL is returned instead
+        (see https://cloud.google.com/storage/docs/access-public-data#api-link).
+        """
+        if self.fs.storage_options.get("token") == "anon":
+            return f"https://storage.googleapis.com/{self.name}/{path}"
+        return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
+
     @staticmethod
     def parse_timestamp(timestamp: str) -> datetime:
         """
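The new `url` method chooses between a plain public URL and a signed URL depending on whether the client was created anonymously. A standalone illustration of that decision using plain fsspec/gcsfs (not `GCSClient` itself; the bucket and object names are made up):

``` py
# Standalone illustration of the anon-vs-signed decision above; bucket and
# object names are hypothetical.
import fsspec

fs = fsspec.filesystem("gs", token="anon")  # anonymous gcsfs filesystem
bucket, path = "datachain-demo", "dogs-and-cats/cat.1.jpg"

if fs.storage_options.get("token") == "anon":
    # Public objects can be fetched directly, no signing needed.
    url = f"https://storage.googleapis.com/{bucket}/{path}"
else:
    # Authenticated clients get a signed URL with an expiration.
    url = fs.sign(f"gs://{bucket}/{path}", expiration=3600)

print(url)
```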
src/datachain/lib/arrow.py

@@ -1,9 +1,11 @@
 from collections.abc import Sequence
-from
+from itertools import islice
 from typing import TYPE_CHECKING, Any, Optional
 
+import fsspec.implementations.reference
 import orjson
 import pyarrow as pa
+from fsspec.core import split_protocol
 from pyarrow.dataset import CsvFileFormat, dataset
 from tqdm import tqdm
 
@@ -25,7 +27,18 @@ if TYPE_CHECKING:
 DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY = b"DataChain SignalSchema"
 
 
+class ReferenceFileSystem(fsspec.implementations.reference.ReferenceFileSystem):
+    def _open(self, path, mode="rb", *args, **kwargs):
+        # overriding because `fsspec`'s `ReferenceFileSystem._open`
+        # reads the whole file in-memory.
+        (uri,) = self.references[path]
+        protocol, _ = split_protocol(uri)
+        return self.fss[protocol]._open(uri, mode, *args, **kwargs)
+
+
 class ArrowGenerator(Generator):
+    DEFAULT_BATCH_SIZE = 2**17  # same as `pyarrow._dataset._DEFAULT_BATCH_SIZE`
+
     def __init__(
         self,
         input_schema: Optional["pa.Schema"] = None,
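For context on the subclass above: an fsspec `ReferenceFileSystem` maps logical keys to target URLs, and the override opens the referenced target through its own filesystem instead of buffering the whole file in memory. A minimal standalone sketch of the mapping idea (the key and the "cached" local file below are made up for illustration):

``` py
# Minimal standalone sketch of the reference-filesystem mapping used above;
# the key and the local file are invented for illustration.
import tempfile

from fsspec.implementations.reference import ReferenceFileSystem

# Stand-in for a file that already exists in a local cache.
cached = tempfile.NamedTemporaryFile(suffix=".csv", delete=False)
cached.write(b"id,name\n1,cat\n")
cached.close()

# Map a logical "remote" path to the locally cached copy.
fs = ReferenceFileSystem({"bucket/data.csv": [f"file://{cached.name}"]})

with fs.open("bucket/data.csv", "rb") as f:
    print(f.read())  # bytes are served from the cached local file
```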
@@ -55,57 +68,80 @@ class ArrowGenerator(Generator):
     def process(self, file: File):
         if file._caching_enabled:
             file.ensure_cached()
-            path = _nrows_file(file, self.nrows)
-            ds = dataset(path, schema=self.input_schema, **self.kwargs)
+            cache_path = file.get_local_path()
+            fs_path = file.path
+            fs = ReferenceFileSystem({fs_path: [cache_path]})
         else:
+            fs, fs_path = file.get_fs(), file.get_path()
+
+        ds = dataset(fs_path, schema=self.input_schema, filesystem=fs, **self.kwargs)
+
         hf_schema = _get_hf_schema(ds.schema)
         use_datachain_schema = (
             bool(ds.schema.metadata)
             and DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY in ds.schema.metadata
         )
+
+        kw = {}
+        if self.nrows:
+            kw = {"batch_size": min(self.DEFAULT_BATCH_SIZE, self.nrows)}
+
+        def iter_records():
+            for record_batch in ds.to_batches(**kw):
+                yield from record_batch.to_pylist()
+
+        it = islice(iter_records(), self.nrows)
+        with tqdm(it, desc="Parsed by pyarrow", unit="rows", total=self.nrows) as pbar:
+            for index, record in enumerate(pbar):
+                yield self._process_record(
+                    record, file, index, hf_schema, use_datachain_schema
+                )
+
+    def _process_record(
+        self,
+        record: dict[str, Any],
+        file: File,
+        index: int,
+        hf_schema: Optional[tuple["Features", dict[str, "DataType"]]],
+        use_datachain_schema: bool,
+    ):
+        if use_datachain_schema and self.output_schema:
+            vals = [_nested_model_instantiate(record, self.output_schema)]
+        else:
+            vals = self._process_non_datachain_record(record, hf_schema)
+
+        if self.source:
+            kwargs: dict = self.kwargs
+            # Can't serialize CsvFileFormat; may lose formatting options.
+            if isinstance(kwargs.get("format"), CsvFileFormat):
+                kwargs["format"] = "csv"
+            arrow_file = ArrowRow(file=file, index=index, kwargs=kwargs)
+            return [arrow_file, *vals]
+        return vals
+
+    def _process_non_datachain_record(
+        self,
+        record: dict[str, Any],
+        hf_schema: Optional[tuple["Features", dict[str, "DataType"]]],
+    ):
+        vals = list(record.values())
+        if not self.output_schema:
+            return vals
+
+        fields = self.output_schema.model_fields
+        vals_dict = {}
+        for i, ((field, field_info), val) in enumerate(zip(fields.items(), vals)):
+            anno = field_info.annotation
+            if hf_schema:
+                from datachain.lib.hf import convert_feature
+
+                feat = list(hf_schema[0].values())[i]
+                vals_dict[field] = convert_feature(val, feat, anno)
+            elif ModelStore.is_pydantic(anno):
+                vals_dict[field] = anno(**val)  # type: ignore[misc]
+            else:
+                vals_dict[field] = val
+        return [self.output_schema(**vals_dict)]
 
 
 def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
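The rewritten `process` streams record batches and caps them with `islice` instead of copying the first `nrows` lines into a temporary file (the `_nrows_file` helper removed in the next hunk). A standalone sketch of that read pattern with plain pyarrow (the sample parquet file is created just for illustration):

``` py
# Standalone sketch of the batched, row-limited read pattern above.
from itertools import islice

import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow.dataset import dataset

# Hypothetical sample file, written only so the sketch is runnable.
pq.write_table(pa.table({"id": list(range(1000)), "label": ["x"] * 1000}),
               "/tmp/sample.parquet")

nrows = 5
ds = dataset("/tmp/sample.parquet")
batch_size = min(2**17, nrows)  # mirrors the DEFAULT_BATCH_SIZE cap in the diff

def iter_records():
    for batch in ds.to_batches(batch_size=batch_size):
        yield from batch.to_pylist()  # dicts, one per row

for record in islice(iter_records(), nrows):
    print(record)  # only the first 5 rows are ever materialized
```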
@@ -190,18 +226,6 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa:
     raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")
 
 
-def _nrows_file(file: File, nrows: int) -> str:
-    tf = NamedTemporaryFile(delete=False)  # noqa: SIM115
-    with file.open(mode="r") as reader:
-        with open(tf.name, "a") as writer:
-            for row, line in enumerate(reader):
-                if row >= nrows:
-                    break
-                writer.write(line)
-                writer.write("\n")
-    return tf.name
-
-
 def _get_hf_schema(
     schema: "pa.Schema",
 ) -> Optional[tuple["Features", dict[str, "DataType"]]]: