datachain 0.7.9__tar.gz → 0.7.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.7.9 → datachain-0.7.11}/.github/workflows/tests.yml +33 -5
- {datachain-0.7.9 → datachain-0.7.11}/.pre-commit-config.yaml +1 -1
- datachain-0.7.11/PKG-INFO +206 -0
- datachain-0.7.11/README.rst +104 -0
- datachain-0.7.11/docs/contributing.md +115 -0
- datachain-0.7.11/docs/css/github-permalink-style.css +39 -0
- datachain-0.7.9/docs/index.md → datachain-0.7.11/docs/examples.md +51 -61
- datachain-0.7.11/docs/index.md +106 -0
- datachain-0.7.11/docs/quick-start.md +290 -0
- datachain-0.7.11/docs/references/index.md +14 -0
- datachain-0.7.11/docs/tutorials.md +9 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/get_started/torch-loader.py +25 -20
- {datachain-0.7.9 → datachain-0.7.11}/examples/llm_and_nlp/unstructured-embeddings-gen.py +7 -5
- {datachain-0.7.9 → datachain-0.7.11}/mkdocs.yml +26 -17
- {datachain-0.7.9 → datachain-0.7.11}/noxfile.py +2 -0
- {datachain-0.7.9 → datachain-0.7.11}/pyproject.toml +3 -3
- datachain-0.7.11/src/datachain/client/__init__.py +3 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/client/fsspec.py +4 -2
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/client/local.py +9 -4
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/__init__.py +4 -1
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/numeric.py +46 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/string.py +46 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/convert/flatten.py +7 -5
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/convert/unflatten.py +2 -2
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/convert/values_to_tuples.py +1 -1
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/dc.py +5 -1
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/file.py +2 -1
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/meta_formats.py +2 -1
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/pytorch.py +1 -5
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/signal_schema.py +28 -6
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/utils.py +1 -1
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/dataset.py +5 -2
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/functions/numeric.py +12 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/functions/string.py +12 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/sqlite/base.py +40 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/toolkit/split.py +19 -6
- datachain-0.7.11/src/datachain.egg-info/PKG-INFO +206 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain.egg-info/SOURCES.txt +5 -1
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain.egg-info/requires.txt +3 -3
- {datachain-0.7.9 → datachain-0.7.11}/tests/conftest.py +12 -10
- {datachain-0.7.9 → datachain-0.7.11}/tests/examples/test_examples.py +14 -29
- datachain-0.7.11/tests/func/test_toolkit.py +51 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_signal_schema.py +5 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_func.py +60 -2
- datachain-0.7.9/CONTRIBUTING.rst +0 -129
- datachain-0.7.9/PKG-INFO +0 -488
- datachain-0.7.9/README.rst +0 -386
- datachain-0.7.9/docs/references/index.md +0 -8
- datachain-0.7.9/src/datachain/client/__init__.py +0 -4
- datachain-0.7.9/src/datachain.egg-info/PKG-INFO +0 -488
- datachain-0.7.9/tests/func/test_toolkit.py +0 -42
- {datachain-0.7.9 → datachain-0.7.11}/.cruft.json +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/.gitattributes +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/.github/codecov.yaml +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/.github/dependabot.yml +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/.github/workflows/release.yml +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/.gitignore +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/LICENSE +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/docs/assets/datachain.svg +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/docs/overrides/main.html +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/docs/references/datachain.md +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/docs/references/datatype.md +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/docs/references/file.md +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/docs/references/sql.md +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/docs/references/torch.md +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/docs/references/udf.md +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/multimodal/wds.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/setup.cfg +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/__main__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/asyn.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/cache.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/cli.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/cli_utils.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/client/azure.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/client/gcs.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/client/hf.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/client/s3.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/config.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/dataset.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/error.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/array.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/base.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/conditional.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/func.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/path.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/random.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/func/window.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/job.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/clip.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/hf.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/image.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/listing.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/settings.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/tar.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/text.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/udf.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/listing.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/model/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/model/bbox.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/model/pose.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/model/segment.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/node.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/progress.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/py.typed +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/batch.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/metrics.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/params.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/queue.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/schema.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/query/session.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/remote/studio.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/types.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/sql/utils.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/studio.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/telemetry.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain/utils.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/data.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/examples/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/examples/wds_data.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/func/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_catalog.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_client.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_datachain.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_datasets.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_listing.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_ls.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_metrics.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_pull.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_pytorch.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/func/test_query.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/scripts/feature_class.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/test_atomicity.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/test_cli_e2e.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/test_cli_studio.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/test_query_e2e.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/test_telemetry.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_asyn.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_cache.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_catalog.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_client.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_config.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_dataset.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_listing.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_metastore.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_query.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_query_params.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_serializer.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_session.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_utils.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.7.9 → datachain-0.7.11}/tests/utils.py +0 -0
|
@@ -3,7 +3,7 @@ name: Tests
|
|
|
3
3
|
on:
|
|
4
4
|
push:
|
|
5
5
|
branches: [main]
|
|
6
|
-
|
|
6
|
+
pull_request_target:
|
|
7
7
|
workflow_dispatch:
|
|
8
8
|
|
|
9
9
|
env:
|
|
@@ -14,13 +14,22 @@ concurrency:
|
|
|
14
14
|
cancel-in-progress: true
|
|
15
15
|
|
|
16
16
|
jobs:
|
|
17
|
+
authorize:
|
|
18
|
+
environment: ${{ github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository && 'external' || 'internal' }}
|
|
19
|
+
runs-on: ubuntu-latest
|
|
20
|
+
steps:
|
|
21
|
+
- run: true
|
|
22
|
+
|
|
17
23
|
lint:
|
|
24
|
+
needs: authorize
|
|
25
|
+
|
|
18
26
|
runs-on: ubuntu-latest
|
|
19
27
|
steps:
|
|
20
28
|
- name: Check out the repository
|
|
21
29
|
uses: actions/checkout@v4
|
|
22
30
|
with:
|
|
23
31
|
fetch-depth: 0
|
|
32
|
+
ref: ${{ github.event.pull_request.head.sha || github.ref }}
|
|
24
33
|
|
|
25
34
|
- name: Set up Python 3.9
|
|
26
35
|
uses: actions/setup-python@v5
|
|
@@ -53,6 +62,8 @@ jobs:
|
|
|
53
62
|
run: nox -s lint
|
|
54
63
|
|
|
55
64
|
datachain:
|
|
65
|
+
needs: authorize
|
|
66
|
+
|
|
56
67
|
timeout-minutes: 40
|
|
57
68
|
runs-on: ${{ matrix.os }}
|
|
58
69
|
strategy:
|
|
@@ -75,6 +86,7 @@ jobs:
|
|
|
75
86
|
uses: actions/checkout@v4
|
|
76
87
|
with:
|
|
77
88
|
fetch-depth: 0
|
|
89
|
+
ref: ${{ github.event.pull_request.head.sha || github.ref }}
|
|
78
90
|
|
|
79
91
|
- name: Set up Python ${{ matrix.pyv }}
|
|
80
92
|
uses: actions/setup-python@v5
|
|
@@ -117,12 +129,14 @@ jobs:
|
|
|
117
129
|
run: nox -s docs
|
|
118
130
|
|
|
119
131
|
examples:
|
|
132
|
+
needs: authorize
|
|
133
|
+
|
|
120
134
|
runs-on: ${{ matrix.os }}
|
|
121
135
|
timeout-minutes: 60
|
|
122
136
|
strategy:
|
|
123
137
|
fail-fast: false
|
|
124
138
|
matrix:
|
|
125
|
-
os: [ubuntu-latest,
|
|
139
|
+
os: [ubuntu-latest, windows-latest]
|
|
126
140
|
pyv: ['3.9', '3.12']
|
|
127
141
|
group: ['get_started', 'llm_and_nlp or computer_vision', 'multimodal']
|
|
128
142
|
exclude:
|
|
@@ -132,9 +146,10 @@ jobs:
|
|
|
132
146
|
- {os: ubuntu-latest-4-cores, pyv: "3.9", group: multimodal}
|
|
133
147
|
- {os: ubuntu-latest-4-cores, pyv: "3.12", group: multimodal}
|
|
134
148
|
|
|
135
|
-
|
|
136
149
|
steps:
|
|
137
150
|
- uses: actions/checkout@v4
|
|
151
|
+
with:
|
|
152
|
+
ref: ${{ github.event.pull_request.head.sha || github.ref }}
|
|
138
153
|
|
|
139
154
|
- name: Set up Python ${{ matrix.pyv }}
|
|
140
155
|
uses: actions/setup-python@v5
|
|
@@ -151,7 +166,20 @@ jobs:
|
|
|
151
166
|
- name: Install nox
|
|
152
167
|
run: uv pip install nox --system
|
|
153
168
|
|
|
169
|
+
# HF runs against actual API - thus run it only once
|
|
170
|
+
- name: Set hf token
|
|
171
|
+
if: matrix.os == 'ubuntu-latest' && matrix.pyv == '3.12'
|
|
172
|
+
run: echo 'HF_TOKEN=${{ secrets.HF_TOKEN }}' >> "$GITHUB_ENV"
|
|
173
|
+
|
|
154
174
|
- name: Run examples
|
|
155
|
-
env:
|
|
156
|
-
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
157
175
|
run: nox -s examples -p ${{ matrix.pyv }} -- -m "${{ matrix.group }}"
|
|
176
|
+
|
|
177
|
+
check:
|
|
178
|
+
if: always()
|
|
179
|
+
needs: [lint, datachain, examples]
|
|
180
|
+
runs-on: ubuntu-latest
|
|
181
|
+
steps:
|
|
182
|
+
- uses: re-actors/alls-green@release/v1
|
|
183
|
+
with:
|
|
184
|
+
allowed-failures: examples
|
|
185
|
+
jobs: ${{ toJSON(needs) }}
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: datachain
|
|
3
|
+
Version: 0.7.11
|
|
4
|
+
Summary: Wrangle unstructured AI data at scale
|
|
5
|
+
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Documentation, https://datachain.dvc.ai
|
|
8
|
+
Project-URL: Issues, https://github.com/iterative/datachain/issues
|
|
9
|
+
Project-URL: Source, https://github.com/iterative/datachain
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
16
|
+
Requires-Python: >=3.9
|
|
17
|
+
Description-Content-Type: text/x-rst
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: pyyaml
|
|
20
|
+
Requires-Dist: tomlkit
|
|
21
|
+
Requires-Dist: tqdm
|
|
22
|
+
Requires-Dist: numpy<3,>=1
|
|
23
|
+
Requires-Dist: pandas>=2.0.0
|
|
24
|
+
Requires-Dist: pyarrow
|
|
25
|
+
Requires-Dist: typing-extensions
|
|
26
|
+
Requires-Dist: python-dateutil>=2
|
|
27
|
+
Requires-Dist: attrs>=21.3.0
|
|
28
|
+
Requires-Dist: s3fs>=2024.2.0
|
|
29
|
+
Requires-Dist: gcsfs>=2024.2.0
|
|
30
|
+
Requires-Dist: adlfs>=2024.2.0
|
|
31
|
+
Requires-Dist: dvc-data<4,>=3.10
|
|
32
|
+
Requires-Dist: dvc-objects<6,>=4
|
|
33
|
+
Requires-Dist: shtab<2,>=1.3.4
|
|
34
|
+
Requires-Dist: sqlalchemy>=2
|
|
35
|
+
Requires-Dist: multiprocess==0.70.16
|
|
36
|
+
Requires-Dist: cloudpickle
|
|
37
|
+
Requires-Dist: orjson>=3.10.5
|
|
38
|
+
Requires-Dist: pydantic<3,>=2
|
|
39
|
+
Requires-Dist: jmespath>=1.0
|
|
40
|
+
Requires-Dist: datamodel-code-generator>=0.25
|
|
41
|
+
Requires-Dist: Pillow<12,>=10.0.0
|
|
42
|
+
Requires-Dist: msgpack<2,>=1.0.4
|
|
43
|
+
Requires-Dist: psutil
|
|
44
|
+
Requires-Dist: huggingface_hub
|
|
45
|
+
Requires-Dist: iterative-telemetry>=0.0.9
|
|
46
|
+
Requires-Dist: platformdirs
|
|
47
|
+
Requires-Dist: dvc-studio-client<1,>=0.21
|
|
48
|
+
Requires-Dist: tabulate
|
|
49
|
+
Provides-Extra: docs
|
|
50
|
+
Requires-Dist: mkdocs>=1.5.2; extra == "docs"
|
|
51
|
+
Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
|
|
52
|
+
Requires-Dist: mkdocs-material>=9.3.1; extra == "docs"
|
|
53
|
+
Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
|
|
54
|
+
Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
|
|
55
|
+
Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
|
|
56
|
+
Provides-Extra: torch
|
|
57
|
+
Requires-Dist: torch>=2.1.0; extra == "torch"
|
|
58
|
+
Requires-Dist: torchvision; extra == "torch"
|
|
59
|
+
Requires-Dist: transformers>=4.36.0; extra == "torch"
|
|
60
|
+
Provides-Extra: remote
|
|
61
|
+
Requires-Dist: lz4; extra == "remote"
|
|
62
|
+
Requires-Dist: requests>=2.22.0; extra == "remote"
|
|
63
|
+
Provides-Extra: vector
|
|
64
|
+
Requires-Dist: usearch; extra == "vector"
|
|
65
|
+
Provides-Extra: hf
|
|
66
|
+
Requires-Dist: numba>=0.60.0; extra == "hf"
|
|
67
|
+
Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
|
|
68
|
+
Provides-Extra: tests
|
|
69
|
+
Requires-Dist: datachain[hf,remote,torch,vector]; extra == "tests"
|
|
70
|
+
Requires-Dist: pytest<9,>=8; extra == "tests"
|
|
71
|
+
Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
|
|
72
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
|
|
73
|
+
Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
|
|
74
|
+
Requires-Dist: pytest-servers[all]>=0.5.8; extra == "tests"
|
|
75
|
+
Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
|
|
76
|
+
Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
|
|
77
|
+
Requires-Dist: virtualenv; extra == "tests"
|
|
78
|
+
Requires-Dist: dulwich; extra == "tests"
|
|
79
|
+
Requires-Dist: hypothesis; extra == "tests"
|
|
80
|
+
Requires-Dist: open_clip_torch; extra == "tests"
|
|
81
|
+
Requires-Dist: aiotools>=1.7.0; extra == "tests"
|
|
82
|
+
Requires-Dist: requests-mock; extra == "tests"
|
|
83
|
+
Requires-Dist: scipy; extra == "tests"
|
|
84
|
+
Provides-Extra: dev
|
|
85
|
+
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
86
|
+
Requires-Dist: mypy==1.13.0; extra == "dev"
|
|
87
|
+
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
88
|
+
Requires-Dist: types-pytz; extra == "dev"
|
|
89
|
+
Requires-Dist: types-PyYAML; extra == "dev"
|
|
90
|
+
Requires-Dist: types-requests; extra == "dev"
|
|
91
|
+
Requires-Dist: types-tabulate; extra == "dev"
|
|
92
|
+
Provides-Extra: examples
|
|
93
|
+
Requires-Dist: datachain[tests]; extra == "examples"
|
|
94
|
+
Requires-Dist: defusedxml; extra == "examples"
|
|
95
|
+
Requires-Dist: accelerate; extra == "examples"
|
|
96
|
+
Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
|
|
97
|
+
Requires-Dist: unstructured[pdf]; extra == "examples"
|
|
98
|
+
Requires-Dist: pdfplumber==0.11.4; extra == "examples"
|
|
99
|
+
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
100
|
+
Requires-Dist: onnx==1.16.1; extra == "examples"
|
|
101
|
+
Requires-Dist: ultralytics==8.3.48; extra == "examples"
|
|
102
|
+
|
|
103
|
+
================
|
|
104
|
+
|logo| DataChain
|
|
105
|
+
================
|
|
106
|
+
|
|
107
|
+
|PyPI| |Python Version| |Codecov| |Tests|
|
|
108
|
+
|
|
109
|
+
.. |logo| image:: docs/assets/datachain.svg
|
|
110
|
+
:height: 24
|
|
111
|
+
.. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
|
|
112
|
+
:target: https://pypi.org/project/datachain/
|
|
113
|
+
:alt: PyPI
|
|
114
|
+
.. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
|
|
115
|
+
:target: https://pypi.org/project/datachain
|
|
116
|
+
:alt: Python Version
|
|
117
|
+
.. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
|
|
118
|
+
:target: https://codecov.io/gh/iterative/datachain
|
|
119
|
+
:alt: Codecov
|
|
120
|
+
.. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
|
|
121
|
+
:target: https://github.com/iterative/datachain/actions/workflows/tests.yml
|
|
122
|
+
:alt: Tests
|
|
123
|
+
|
|
124
|
+
DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
|
|
125
|
+
data like images, audio, videos, text and PDFs. It integrates with external storage
|
|
126
|
+
(e.g. S3) to process data efficiently without data duplication and manages metadata
|
|
127
|
+
in an internal database for easy and efficient querying.
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
Use Cases
|
|
131
|
+
=========
|
|
132
|
+
|
|
133
|
+
1. **ETL.** Pythonic framework for describing and running unstructured data transformations
|
|
134
|
+
and enrichments, applying models to data, including LLMs.
|
|
135
|
+
2. **Analytics.** DataChain dataset is a table that combines all the information about data
|
|
136
|
+
objects in one place + it provides dataframe-like API and vectorized engine to do analytics
|
|
137
|
+
on these tables at scale.
|
|
138
|
+
3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
|
|
139
|
+
Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
|
|
140
|
+
|
|
141
|
+
Getting Started
|
|
142
|
+
===============
|
|
143
|
+
|
|
144
|
+
Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
|
|
145
|
+
to get started with `DataChain` and learn more.
|
|
146
|
+
|
|
147
|
+
Key Features
|
|
148
|
+
============
|
|
149
|
+
|
|
150
|
+
📂 **Multimodal Dataset Versioning.**
|
|
151
|
+
- Version unstructured data without moving or creating data copies, by supporting
|
|
152
|
+
references to S3, GCP, Azure, and local file systems.
|
|
153
|
+
- Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
|
|
154
|
+
- Unite files and metadata together into persistent, versioned, columnar datasets.
|
|
155
|
+
|
|
156
|
+
🐍 **Python-friendly.**
|
|
157
|
+
- Operate on Python objects and object fields: float scores, strings, matrixes,
|
|
158
|
+
LLM response objects.
|
|
159
|
+
- Run Python code on high-scale, terabyte-size datasets, with built-in
|
|
160
|
+
parallelization and memory-efficient computing — no SQL or Spark required.
|
|
161
|
+
|
|
162
|
+
🧠 **Data Enrichment and Processing.**
|
|
163
|
+
- Generate metadata using local AI models and LLM APIs.
|
|
164
|
+
- Filter, join, and group datasets by metadata. Search by vector embeddings.
|
|
165
|
+
- High-performance vectorized operations on Python objects: sum, count, avg, etc.
|
|
166
|
+
- Pass datasets to PyTorch and TensorFlow, or export them back into storage.
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
Contributing
|
|
170
|
+
============
|
|
171
|
+
|
|
172
|
+
Contributions are very welcome. To learn more, see the `Contributor Guide`_.
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
Community and Support
|
|
176
|
+
=====================
|
|
177
|
+
|
|
178
|
+
* `Docs <https://docs.datachain.ai/>`_
|
|
179
|
+
* `File an issue`_ if you encounter any problems
|
|
180
|
+
* `Discord Chat <https://dvc.org/chat>`_
|
|
181
|
+
* `Email <mailto:support@dvc.org>`_
|
|
182
|
+
* `Twitter <https://twitter.com/DVCorg>`_
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
DataChain Studio Platform
|
|
186
|
+
=========================
|
|
187
|
+
|
|
188
|
+
`DataChain Studio`_ is a proprietary solution for teams that offers:
|
|
189
|
+
|
|
190
|
+
- **Centralized dataset registry** to manage data, code and
|
|
191
|
+
dependencies in one place.
|
|
192
|
+
- **Data Lineage** for data sources as well as derivative datasets.
|
|
193
|
+
- **UI for Multimodal Data** like images, videos, and PDFs.
|
|
194
|
+
- **Scalable Compute** to handle large datasets (100M+ files) and in-house
|
|
195
|
+
AI model inference.
|
|
196
|
+
- **Access control** including SSO and team-based collaboration.
|
|
197
|
+
|
|
198
|
+
.. _PyPI: https://pypi.org/
|
|
199
|
+
.. _file an issue: https://github.com/iterative/datachain/issues
|
|
200
|
+
.. github-only
|
|
201
|
+
.. _Contributor Guide: https://docs.datachain.ai/contributing
|
|
202
|
+
.. _Pydantic: https://github.com/pydantic/pydantic
|
|
203
|
+
.. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
|
|
204
|
+
.. _SQLite: https://www.sqlite.org/
|
|
205
|
+
.. _Getting Started: https://docs.datachain.ai/
|
|
206
|
+
.. _DataChain Studio: https://studio.datachain.ai/
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
================
|
|
2
|
+
|logo| DataChain
|
|
3
|
+
================
|
|
4
|
+
|
|
5
|
+
|PyPI| |Python Version| |Codecov| |Tests|
|
|
6
|
+
|
|
7
|
+
.. |logo| image:: docs/assets/datachain.svg
|
|
8
|
+
:height: 24
|
|
9
|
+
.. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
|
|
10
|
+
:target: https://pypi.org/project/datachain/
|
|
11
|
+
:alt: PyPI
|
|
12
|
+
.. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
|
|
13
|
+
:target: https://pypi.org/project/datachain
|
|
14
|
+
:alt: Python Version
|
|
15
|
+
.. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
|
|
16
|
+
:target: https://codecov.io/gh/iterative/datachain
|
|
17
|
+
:alt: Codecov
|
|
18
|
+
.. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
|
|
19
|
+
:target: https://github.com/iterative/datachain/actions/workflows/tests.yml
|
|
20
|
+
:alt: Tests
|
|
21
|
+
|
|
22
|
+
DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
|
|
23
|
+
data like images, audio, videos, text and PDFs. It integrates with external storage
|
|
24
|
+
(e.g. S3) to process data efficiently without data duplication and manages metadata
|
|
25
|
+
in an internal database for easy and efficient querying.
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
Use Cases
|
|
29
|
+
=========
|
|
30
|
+
|
|
31
|
+
1. **ETL.** Pythonic framework for describing and running unstructured data transformations
|
|
32
|
+
and enrichments, applying models to data, including LLMs.
|
|
33
|
+
2. **Analytics.** DataChain dataset is a table that combines all the information about data
|
|
34
|
+
objects in one place + it provides dataframe-like API and vectorized engine to do analytics
|
|
35
|
+
on these tables at scale.
|
|
36
|
+
3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
|
|
37
|
+
Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
|
|
38
|
+
|
|
39
|
+
Getting Started
|
|
40
|
+
===============
|
|
41
|
+
|
|
42
|
+
Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
|
|
43
|
+
to get started with `DataChain` and learn more.
|
|
44
|
+
|
|
45
|
+
Key Features
|
|
46
|
+
============
|
|
47
|
+
|
|
48
|
+
📂 **Multimodal Dataset Versioning.**
|
|
49
|
+
- Version unstructured data without moving or creating data copies, by supporting
|
|
50
|
+
references to S3, GCP, Azure, and local file systems.
|
|
51
|
+
- Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
|
|
52
|
+
- Unite files and metadata together into persistent, versioned, columnar datasets.
|
|
53
|
+
|
|
54
|
+
🐍 **Python-friendly.**
|
|
55
|
+
- Operate on Python objects and object fields: float scores, strings, matrixes,
|
|
56
|
+
LLM response objects.
|
|
57
|
+
- Run Python code on high-scale, terabyte-size datasets, with built-in
|
|
58
|
+
parallelization and memory-efficient computing — no SQL or Spark required.
|
|
59
|
+
|
|
60
|
+
🧠 **Data Enrichment and Processing.**
|
|
61
|
+
- Generate metadata using local AI models and LLM APIs.
|
|
62
|
+
- Filter, join, and group datasets by metadata. Search by vector embeddings.
|
|
63
|
+
- High-performance vectorized operations on Python objects: sum, count, avg, etc.
|
|
64
|
+
- Pass datasets to PyTorch and TensorFlow, or export them back into storage.
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
Contributing
|
|
68
|
+
============
|
|
69
|
+
|
|
70
|
+
Contributions are very welcome. To learn more, see the `Contributor Guide`_.
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
Community and Support
|
|
74
|
+
=====================
|
|
75
|
+
|
|
76
|
+
* `Docs <https://docs.datachain.ai/>`_
|
|
77
|
+
* `File an issue`_ if you encounter any problems
|
|
78
|
+
* `Discord Chat <https://dvc.org/chat>`_
|
|
79
|
+
* `Email <mailto:support@dvc.org>`_
|
|
80
|
+
* `Twitter <https://twitter.com/DVCorg>`_
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
DataChain Studio Platform
|
|
84
|
+
=========================
|
|
85
|
+
|
|
86
|
+
`DataChain Studio`_ is a proprietary solution for teams that offers:
|
|
87
|
+
|
|
88
|
+
- **Centralized dataset registry** to manage data, code and
|
|
89
|
+
dependencies in one place.
|
|
90
|
+
- **Data Lineage** for data sources as well as derivative datasets.
|
|
91
|
+
- **UI for Multimodal Data** like images, videos, and PDFs.
|
|
92
|
+
- **Scalable Compute** to handle large datasets (100M+ files) and in-house
|
|
93
|
+
AI model inference.
|
|
94
|
+
- **Access control** including SSO and team-based collaboration.
|
|
95
|
+
|
|
96
|
+
.. _PyPI: https://pypi.org/
|
|
97
|
+
.. _file an issue: https://github.com/iterative/datachain/issues
|
|
98
|
+
.. github-only
|
|
99
|
+
.. _Contributor Guide: https://docs.datachain.ai/contributing
|
|
100
|
+
.. _Pydantic: https://github.com/pydantic/pydantic
|
|
101
|
+
.. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
|
|
102
|
+
.. _SQLite: https://www.sqlite.org/
|
|
103
|
+
.. _Getting Started: https://docs.datachain.ai/
|
|
104
|
+
.. _DataChain Studio: https://studio.datachain.ai/
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: Contributing
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
# Contributor Guide
|
|
6
|
+
|
|
7
|
+
Thank you for your interest in improving this project. This project is
|
|
8
|
+
open-source under the [Apache 2.0
|
|
9
|
+
license](https://opensource.org/licenses/Apache-2.0) and welcomes
|
|
10
|
+
contributions in the form of bug reports, feature requests, and pull
|
|
11
|
+
requests.
|
|
12
|
+
|
|
13
|
+
Here is a list of important resources for contributors:
|
|
14
|
+
|
|
15
|
+
- [Source Code](https://github.com/iterative/datachain)
|
|
16
|
+
- [Documentation](https://docs.datachain.ai/)
|
|
17
|
+
- [Issue Tracker](https://github.com/iterative/datachain/issues)
|
|
18
|
+
- [Code of Conduct](https://github.com/iterative/datachain?tab=coc-ov-file)
|
|
19
|
+
|
|
20
|
+
## How to report a bug
|
|
21
|
+
|
|
22
|
+
Report bugs on the [Issue
|
|
23
|
+
Tracker](https://github.com/iterative/datachain/issues).
|
|
24
|
+
|
|
25
|
+
When filing an issue, make sure to answer these questions:
|
|
26
|
+
|
|
27
|
+
- Which operating system and Python version are you using?
|
|
28
|
+
- Which version of this project are you using?
|
|
29
|
+
- What did you do?
|
|
30
|
+
- What did you expect to see?
|
|
31
|
+
- What did you see instead?
|
|
32
|
+
|
|
33
|
+
The best way to get your bug fixed is to provide a test case, and/or
|
|
34
|
+
steps to reproduce the issue.
|
|
35
|
+
|
|
36
|
+
## How to request a feature
|
|
37
|
+
|
|
38
|
+
Request features on the [Issue
|
|
39
|
+
Tracker](https://github.com/iterative/datachain/issues).
|
|
40
|
+
|
|
41
|
+
## How to set up your development environment
|
|
42
|
+
|
|
43
|
+
You need Python 3.8+ and the following tools:
|
|
44
|
+
|
|
45
|
+
- [Nox](https://nox.thea.codes/)
|
|
46
|
+
|
|
47
|
+
Install the package with development requirements:
|
|
48
|
+
|
|
49
|
+
``` console
|
|
50
|
+
$ pip install nox
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## How to test the project
|
|
54
|
+
|
|
55
|
+
Run the full test suite:
|
|
56
|
+
|
|
57
|
+
``` console
|
|
58
|
+
$ nox
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
List the available Nox sessions:
|
|
62
|
+
|
|
63
|
+
``` console
|
|
64
|
+
$ nox --list-sessions
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
You can also run a specific Nox session. For example, invoke the unit
|
|
68
|
+
test suite like this:
|
|
69
|
+
|
|
70
|
+
``` console
|
|
71
|
+
$ nox --session=tests
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Unit tests are located in the `tests` directory, and are written using
|
|
75
|
+
the [pytest](https://pytest.readthedocs.io/) testing framework.
|
|
76
|
+
|
|
77
|
+
## Build documentation
|
|
78
|
+
|
|
79
|
+
If you've made any changes to the documentation (including changes to
|
|
80
|
+
function signatures, class definitions, or docstrings that will appear
|
|
81
|
+
in the API documentation), make sure it builds successfully.
|
|
82
|
+
|
|
83
|
+
``` console
|
|
84
|
+
$ nox -s docs
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
In order to run this locally with hot reload on changes:
|
|
88
|
+
|
|
89
|
+
``` console
|
|
90
|
+
$ mkdocs serve
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## How to submit changes
|
|
94
|
+
|
|
95
|
+
Open a [pull request](https://github.com/iterative/datachain/pulls) to
|
|
96
|
+
submit changes to this project.
|
|
97
|
+
|
|
98
|
+
Your pull request needs to meet the following guidelines for acceptance:
|
|
99
|
+
|
|
100
|
+
- The Nox test suite must pass without errors and warnings.
|
|
101
|
+
- Include unit tests. This project maintains 100% code coverage.
|
|
102
|
+
- If your changes add functionality, update the documentation
|
|
103
|
+
accordingly.
|
|
104
|
+
|
|
105
|
+
Feel free to submit early, though---we can always iterate on this.
|
|
106
|
+
|
|
107
|
+
To run linting and code formatting checks, you can invoke a `lint` session in nox:
|
|
108
|
+
|
|
109
|
+
``` console
|
|
110
|
+
$ nox -s lint
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
It is recommended to open an issue before starting work on anything.
|
|
114
|
+
This will allow a chance to talk it over with the owners and validate
|
|
115
|
+
your approach.
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
.headerlink {
|
|
2
|
+
--permalink-size: 16px; /* for font-relative sizes, 0.6em is a good choice */
|
|
3
|
+
--permalink-spacing: 4px;
|
|
4
|
+
|
|
5
|
+
width: calc(var(--permalink-size) + var(--permalink-spacing));
|
|
6
|
+
height: var(--permalink-size);
|
|
7
|
+
vertical-align: middle;
|
|
8
|
+
background-color: var(--md-default-fg-color--lighter);
|
|
9
|
+
background-size: var(--permalink-size);
|
|
10
|
+
mask-size: var(--permalink-size);
|
|
11
|
+
-webkit-mask-size: var(--permalink-size);
|
|
12
|
+
mask-repeat: no-repeat;
|
|
13
|
+
-webkit-mask-repeat: no-repeat;
|
|
14
|
+
visibility: visible;
|
|
15
|
+
mask-image: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" version="1.1" width="16" height="16" aria-hidden="true"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg>');
|
|
16
|
+
-webkit-mask-image: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" version="1.1" width="16" height="16" aria-hidden="true"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg>');
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
[id]:target .headerlink {
|
|
20
|
+
background-color: var(--md-typeset-a-color);
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
.headerlink:hover {
|
|
24
|
+
background-color: var(--md-accent-fg-color) !important;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
@media screen and (min-width: 76.25em) {
|
|
28
|
+
h1, h2, h3, h4, h5, h6 {
|
|
29
|
+
display: flex;
|
|
30
|
+
align-items: center;
|
|
31
|
+
flex-direction: row;
|
|
32
|
+
column-gap: 0.2em; /* fixes spaces in titles */
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
.headerlink {
|
|
36
|
+
order: -1;
|
|
37
|
+
margin-left: calc(var(--permalink-size) * -1 - var(--permalink-spacing)) !important;
|
|
38
|
+
}
|
|
39
|
+
}
|