datachain 0.7.10__tar.gz → 0.7.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.7.10 → datachain-0.7.11}/.github/workflows/tests.yml +16 -3
- {datachain-0.7.10 → datachain-0.7.11}/.pre-commit-config.yaml +1 -1
- {datachain-0.7.10/src/datachain.egg-info → datachain-0.7.11}/PKG-INFO +9 -10
- {datachain-0.7.10 → datachain-0.7.11}/README.rst +5 -6
- {datachain-0.7.10 → datachain-0.7.11}/docs/contributing.md +4 -0
- datachain-0.7.11/docs/css/github-permalink-style.css +39 -0
- {datachain-0.7.10 → datachain-0.7.11}/docs/examples.md +4 -1
- {datachain-0.7.10 → datachain-0.7.11}/docs/index.md +4 -1
- {datachain-0.7.10 → datachain-0.7.11}/docs/quick-start.md +4 -0
- {datachain-0.7.10 → datachain-0.7.11}/docs/references/index.md +4 -0
- {datachain-0.7.10 → datachain-0.7.11}/docs/tutorials.md +4 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/get_started/torch-loader.py +25 -20
- {datachain-0.7.10 → datachain-0.7.11}/examples/llm_and_nlp/unstructured-embeddings-gen.py +7 -5
- {datachain-0.7.10 → datachain-0.7.11}/mkdocs.yml +18 -16
- {datachain-0.7.10 → datachain-0.7.11}/noxfile.py +2 -0
- {datachain-0.7.10 → datachain-0.7.11}/pyproject.toml +3 -3
- datachain-0.7.11/src/datachain/client/__init__.py +3 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/dc.py +5 -1
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/file.py +2 -1
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/meta_formats.py +2 -1
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/pytorch.py +1 -5
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/signal_schema.py +28 -6
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/query/dataset.py +4 -1
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/toolkit/split.py +19 -6
- {datachain-0.7.10 → datachain-0.7.11/src/datachain.egg-info}/PKG-INFO +9 -10
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain.egg-info/SOURCES.txt +1 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain.egg-info/requires.txt +3 -3
- {datachain-0.7.10 → datachain-0.7.11}/tests/conftest.py +12 -10
- {datachain-0.7.10 → datachain-0.7.11}/tests/examples/test_examples.py +14 -29
- datachain-0.7.11/tests/func/test_toolkit.py +51 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_signal_schema.py +5 -0
- datachain-0.7.10/src/datachain/client/__init__.py +0 -4
- datachain-0.7.10/tests/func/test_toolkit.py +0 -42
- {datachain-0.7.10 → datachain-0.7.11}/.cruft.json +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/.gitattributes +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/.github/codecov.yaml +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/.github/dependabot.yml +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/.github/workflows/release.yml +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/.gitignore +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/LICENSE +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/docs/assets/datachain.svg +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/docs/overrides/main.html +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/docs/references/datachain.md +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/docs/references/datatype.md +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/docs/references/file.md +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/docs/references/sql.md +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/docs/references/torch.md +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/docs/references/udf.md +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/get_started/json-csv-reader.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/multimodal/wds.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/setup.cfg +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/__main__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/asyn.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/cache.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/catalog/catalog.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/cli.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/cli_utils.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/client/azure.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/client/gcs.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/client/hf.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/client/local.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/client/s3.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/config.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/data_storage/sqlite.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/dataset.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/error.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/func/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/func/array.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/func/base.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/func/conditional.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/func/func.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/func/numeric.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/func/path.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/func/random.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/func/string.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/func/window.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/job.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/clip.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/hf.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/image.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/listing.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/settings.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/tar.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/text.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/udf.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/utils.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/listing.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/model/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/model/bbox.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/model/pose.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/model/segment.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/node.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/progress.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/py.typed +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/query/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/query/batch.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/query/metrics.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/query/params.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/query/queue.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/query/schema.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/query/session.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/remote/studio.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/types.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/sql/utils.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/studio.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/telemetry.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain/utils.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/data.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/examples/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/examples/wds_data.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/func/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/func/test_catalog.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/func/test_client.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/func/test_datachain.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/func/test_datasets.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/func/test_listing.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/func/test_ls.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/func/test_meta_formats.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/func/test_metrics.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/func/test_pull.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/func/test_pytorch.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/func/test_query.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/scripts/feature_class.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/test_atomicity.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/test_cli_e2e.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/test_cli_studio.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/test_query_e2e.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/test_telemetry.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_datachain.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_asyn.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_cache.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_catalog.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_client.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_config.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_dataset.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_func.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_listing.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_metastore.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_query.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_query_params.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_serializer.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_session.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_utils.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.7.10 → datachain-0.7.11}/tests/utils.py +0 -0
|
@@ -136,7 +136,7 @@ jobs:
|
|
|
136
136
|
strategy:
|
|
137
137
|
fail-fast: false
|
|
138
138
|
matrix:
|
|
139
|
-
os: [ubuntu-latest,
|
|
139
|
+
os: [ubuntu-latest, windows-latest]
|
|
140
140
|
pyv: ['3.9', '3.12']
|
|
141
141
|
group: ['get_started', 'llm_and_nlp or computer_vision', 'multimodal']
|
|
142
142
|
exclude:
|
|
@@ -166,7 +166,20 @@ jobs:
|
|
|
166
166
|
- name: Install nox
|
|
167
167
|
run: uv pip install nox --system
|
|
168
168
|
|
|
169
|
+
# HF runs against actual API - thus run it only once
|
|
170
|
+
- name: Set hf token
|
|
171
|
+
if: matrix.os == 'ubuntu-latest' && matrix.pyv == '3.12'
|
|
172
|
+
run: echo 'HF_TOKEN=${{ secrets.HF_TOKEN }}' >> "$GITHUB_ENV"
|
|
173
|
+
|
|
169
174
|
- name: Run examples
|
|
170
|
-
env:
|
|
171
|
-
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
172
175
|
run: nox -s examples -p ${{ matrix.pyv }} -- -m "${{ matrix.group }}"
|
|
176
|
+
|
|
177
|
+
check:
|
|
178
|
+
if: always()
|
|
179
|
+
needs: [lint, datachain, examples]
|
|
180
|
+
runs-on: ubuntu-latest
|
|
181
|
+
steps:
|
|
182
|
+
- uses: re-actors/alls-green@release/v1
|
|
183
|
+
with:
|
|
184
|
+
allowed-failures: examples
|
|
185
|
+
jobs: ${{ toJSON(needs) }}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.7.10
|
|
3
|
+
Version: 0.7.11
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -91,14 +91,14 @@ Requires-Dist: types-requests; extra == "dev"
|
|
|
91
91
|
Requires-Dist: types-tabulate; extra == "dev"
|
|
92
92
|
Provides-Extra: examples
|
|
93
93
|
Requires-Dist: datachain[tests]; extra == "examples"
|
|
94
|
-
Requires-Dist: numpy<2,>=1; extra == "examples"
|
|
95
94
|
Requires-Dist: defusedxml; extra == "examples"
|
|
96
95
|
Requires-Dist: accelerate; extra == "examples"
|
|
97
|
-
Requires-Dist:
|
|
96
|
+
Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
|
|
97
|
+
Requires-Dist: unstructured[pdf]; extra == "examples"
|
|
98
98
|
Requires-Dist: pdfplumber==0.11.4; extra == "examples"
|
|
99
99
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
100
100
|
Requires-Dist: onnx==1.16.1; extra == "examples"
|
|
101
|
-
Requires-Dist: ultralytics==8.3.
|
|
101
|
+
Requires-Dist: ultralytics==8.3.48; extra == "examples"
|
|
102
102
|
|
|
103
103
|
================
|
|
104
104
|
|logo| DataChain
|
|
@@ -138,6 +138,11 @@ Use Cases
|
|
|
138
138
|
3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
|
|
139
139
|
Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
|
|
140
140
|
|
|
141
|
+
Getting Started
|
|
142
|
+
===============
|
|
143
|
+
|
|
144
|
+
Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
|
|
145
|
+
to get started with `DataChain` and learn more.
|
|
141
146
|
|
|
142
147
|
Key Features
|
|
143
148
|
============
|
|
@@ -161,12 +166,6 @@ Key Features
|
|
|
161
166
|
- Pass datasets to Pytorch and Tensorflow, or export them back into storage.
|
|
162
167
|
|
|
163
168
|
|
|
164
|
-
Getting Started
|
|
165
|
-
===============
|
|
166
|
-
|
|
167
|
-
Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ to get started with `DataChain` and learn more.
|
|
168
|
-
|
|
169
|
-
|
|
170
169
|
Contributing
|
|
171
170
|
============
|
|
172
171
|
|
|
@@ -36,6 +36,11 @@ Use Cases
|
|
|
36
36
|
3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
|
|
37
37
|
Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
|
|
38
38
|
|
|
39
|
+
Getting Started
|
|
40
|
+
===============
|
|
41
|
+
|
|
42
|
+
Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
|
|
43
|
+
to get started with `DataChain` and learn more.
|
|
39
44
|
|
|
40
45
|
Key Features
|
|
41
46
|
============
|
|
@@ -59,12 +64,6 @@ Key Features
|
|
|
59
64
|
- Pass datasets to Pytorch and Tensorflow, or export them back into storage.
|
|
60
65
|
|
|
61
66
|
|
|
62
|
-
Getting Started
|
|
63
|
-
===============
|
|
64
|
-
|
|
65
|
-
Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ to get started with `DataChain` and learn more.
|
|
66
|
-
|
|
67
|
-
|
|
68
67
|
Contributing
|
|
69
68
|
============
|
|
70
69
|
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
.headerlink {
|
|
2
|
+
--permalink-size: 16px; /* for font-relative sizes, 0.6em is a good choice */
|
|
3
|
+
--permalink-spacing: 4px;
|
|
4
|
+
|
|
5
|
+
width: calc(var(--permalink-size) + var(--permalink-spacing));
|
|
6
|
+
height: var(--permalink-size);
|
|
7
|
+
vertical-align: middle;
|
|
8
|
+
background-color: var(--md-default-fg-color--lighter);
|
|
9
|
+
background-size: var(--permalink-size);
|
|
10
|
+
mask-size: var(--permalink-size);
|
|
11
|
+
-webkit-mask-size: var(--permalink-size);
|
|
12
|
+
mask-repeat: no-repeat;
|
|
13
|
+
-webkit-mask-repeat: no-repeat;
|
|
14
|
+
visibility: visible;
|
|
15
|
+
mask-image: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" version="1.1" width="16" height="16" aria-hidden="true"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg>');
|
|
16
|
+
-webkit-mask-image: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" version="1.1" width="16" height="16" aria-hidden="true"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg>');
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
[id]:target .headerlink {
|
|
20
|
+
background-color: var(--md-typeset-a-color);
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
.headerlink:hover {
|
|
24
|
+
background-color: var(--md-accent-fg-color) !important;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
@media screen and (min-width: 76.25em) {
|
|
28
|
+
h1, h2, h3, h4, h5, h6 {
|
|
29
|
+
display: flex;
|
|
30
|
+
align-items: center;
|
|
31
|
+
flex-direction: row;
|
|
32
|
+
column-gap: 0.2em; /* fixes spaces in titles */
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
.headerlink {
|
|
36
|
+
order: -1;
|
|
37
|
+
margin-left: calc(var(--permalink-size) * -1 - var(--permalink-spacing)) !important;
|
|
38
|
+
}
|
|
39
|
+
}
|
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: Examples
|
|
3
|
+
---
|
|
1
4
|
|
|
2
5
|
# Examples
|
|
3
6
|
|
|
@@ -225,7 +228,7 @@ Here is an example from MS COCO “captions” JSON which employs separate secti
|
|
|
225
228
|
}
|
|
226
229
|
```
|
|
227
230
|
|
|
228
|
-
Note how complicated the setup is. Every image is references by the name, and the metadata for this file is keyed by the “id” field. This same field is references later in the “annotations
|
|
231
|
+
Note how complicated the setup is. Every image is references by the name, and the metadata for this file is keyed by the “id” field. This same field is references later in the “annotations” array, which is present in JSON files describing captions and the detected instances. The categories for the instances are stored in the “categories” array.
|
|
229
232
|
|
|
230
233
|
However, Datachain can easily parse the entire COCO structure via several reading and merging operators:
|
|
231
234
|
|
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: Welcome to DataChain
|
|
3
|
+
---
|
|
1
4
|
# <a class="main-header-link" href="/" ><img style="display: inline-block;" src="/assets/datachain.svg" alt="DataChain"> <span style="display: inline-block;"> DataChain</span></a>
|
|
2
5
|
|
|
3
6
|
<style>
|
|
@@ -83,7 +86,7 @@ The following pages provide detailed documentation on DataChain's features, arch
|
|
|
83
86
|
- [🏃🏼‍♂️ Quick Start](quick-start.md): Get up and running with DataChain in no time.
|
|
84
87
|
- [🎯 Examples](examples.md): Explore practical examples and use cases.
|
|
85
88
|
- [📚 Tutorials](tutorials.md): Learn how to use DataChain for specific tasks.
|
|
86
|
-
- [
|
|
89
|
+
- [🐍 API Reference](references/index.md): Dive into the technical details and API reference.
|
|
87
90
|
- [🤝 Contributing](contributing.md): Learn how to contribute to DataChain.
|
|
88
91
|
|
|
89
92
|
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: Tutorials
|
|
3
|
+
---
|
|
4
|
+
|
|
1
5
|
# Tutorials
|
|
2
6
|
|
|
3
7
|
* Multimodal: [GitHub](https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) or [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb)
|
|
@@ -5,6 +5,7 @@ To install the required dependencies:
|
|
|
5
5
|
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
+
import multiprocessing
|
|
8
9
|
import os
|
|
9
10
|
from posixpath import basename
|
|
10
11
|
|
|
@@ -12,17 +13,18 @@ import torch
|
|
|
12
13
|
from torch import nn, optim
|
|
13
14
|
from torch.utils.data import DataLoader
|
|
14
15
|
from torchvision.transforms import v2
|
|
16
|
+
from tqdm import tqdm
|
|
15
17
|
|
|
16
18
|
from datachain import C, DataChain
|
|
17
19
|
from datachain.torch import label_to_int
|
|
18
20
|
|
|
19
21
|
STORAGE = "gs://datachain-demo/dogs-and-cats/"
|
|
20
|
-
NUM_EPOCHS = os.getenv("NUM_EPOCHS", "3")
|
|
22
|
+
NUM_EPOCHS = int(os.getenv("NUM_EPOCHS", "3"))
|
|
21
23
|
|
|
22
24
|
# Define transformation for data preprocessing
|
|
23
25
|
transform = v2.Compose(
|
|
24
26
|
[
|
|
25
|
-
v2.
|
|
27
|
+
v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True)]),
|
|
26
28
|
v2.Resize((64, 64)),
|
|
27
29
|
v2.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
|
|
28
30
|
]
|
|
@@ -54,6 +56,7 @@ class CNN(nn.Module):
|
|
|
54
56
|
if __name__ == "__main__":
|
|
55
57
|
ds = (
|
|
56
58
|
DataChain.from_storage(STORAGE, type="image")
|
|
59
|
+
.settings(cache=True, prefetch=25)
|
|
57
60
|
.filter(C("file.path").glob("*.jpg"))
|
|
58
61
|
.map(
|
|
59
62
|
label=lambda path: label_to_int(basename(path)[:3], CLASSES),
|
|
@@ -64,8 +67,10 @@ if __name__ == "__main__":
|
|
|
64
67
|
|
|
65
68
|
train_loader = DataLoader(
|
|
66
69
|
ds.to_pytorch(transform=transform),
|
|
67
|
-
batch_size=
|
|
68
|
-
num_workers=2,
|
|
70
|
+
batch_size=25,
|
|
71
|
+
num_workers=max(4, os.cpu_count() or 2),
|
|
72
|
+
persistent_workers=True,
|
|
73
|
+
multiprocessing_context=multiprocessing.get_context("spawn"),
|
|
69
74
|
)
|
|
70
75
|
|
|
71
76
|
model = CNN()
|
|
@@ -73,19 +78,19 @@ if __name__ == "__main__":
|
|
|
73
78
|
optimizer = optim.Adam(model.parameters(), lr=0.001)
|
|
74
79
|
|
|
75
80
|
# Train the model
|
|
76
|
-
for epoch in range(
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
81
|
+
for epoch in range(NUM_EPOCHS):
|
|
82
|
+
with tqdm(
|
|
83
|
+
train_loader, desc=f"epoch {epoch + 1}/{NUM_EPOCHS}", unit="batch"
|
|
84
|
+
) as loader:
|
|
85
|
+
for data in loader:
|
|
86
|
+
inputs, labels = data
|
|
87
|
+
optimizer.zero_grad()
|
|
88
|
+
|
|
89
|
+
# Forward pass
|
|
90
|
+
outputs = model(inputs)
|
|
91
|
+
loss = criterion(outputs, labels)
|
|
92
|
+
|
|
93
|
+
# Backward pass and optimize
|
|
94
|
+
loss.backward()
|
|
95
|
+
optimizer.step()
|
|
96
|
+
loader.set_postfix(loss=loss.item())
|
|
@@ -12,11 +12,11 @@ from unstructured.cleaners.core import (
|
|
|
12
12
|
group_broken_paragraphs,
|
|
13
13
|
replace_unicode_quotes,
|
|
14
14
|
)
|
|
15
|
-
from unstructured.
|
|
15
|
+
from unstructured.partition.pdf import partition_pdf
|
|
16
|
+
from unstructured_ingest.embed.huggingface import (
|
|
16
17
|
HuggingFaceEmbeddingConfig,
|
|
17
18
|
HuggingFaceEmbeddingEncoder,
|
|
18
19
|
)
|
|
19
|
-
from unstructured.partition.pdf import partition_pdf
|
|
20
20
|
|
|
21
21
|
from datachain import C, DataChain, DataModel, File
|
|
22
22
|
|
|
@@ -43,6 +43,7 @@ def process_pdf(file: File) -> Iterator[Chunk]:
|
|
|
43
43
|
chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast")
|
|
44
44
|
|
|
45
45
|
# Clean the chunks and add new columns
|
|
46
|
+
text_chunks = []
|
|
46
47
|
for chunk in chunks:
|
|
47
48
|
chunk.apply(
|
|
48
49
|
lambda text: clean(
|
|
@@ -51,16 +52,17 @@ def process_pdf(file: File) -> Iterator[Chunk]:
|
|
|
51
52
|
)
|
|
52
53
|
chunk.apply(replace_unicode_quotes)
|
|
53
54
|
chunk.apply(group_broken_paragraphs)
|
|
55
|
+
text_chunks.append({"text": str(chunk)})
|
|
54
56
|
|
|
55
57
|
# create embeddings
|
|
56
|
-
chunks_embedded = embedding_encoder.embed_documents(
|
|
58
|
+
chunks_embedded = embedding_encoder.embed_documents(text_chunks)
|
|
57
59
|
|
|
58
60
|
# Add new rows to DataChain
|
|
59
61
|
for chunk in chunks_embedded:
|
|
60
62
|
yield Chunk(
|
|
61
63
|
key=file.path,
|
|
62
|
-
text=chunk.text,
|
|
63
|
-
embeddings=chunk.embeddings,
|
|
64
|
+
text=chunk.get("text"),
|
|
65
|
+
embeddings=chunk.get("embeddings"),
|
|
64
66
|
)
|
|
65
67
|
|
|
66
68
|
|
|
@@ -27,7 +27,6 @@ theme:
|
|
|
27
27
|
- navigation.tabs
|
|
28
28
|
- navigation.path
|
|
29
29
|
- navigation.top
|
|
30
|
-
- navigation.prune
|
|
31
30
|
- navigation.footer
|
|
32
31
|
- toc.follow
|
|
33
32
|
- content.action.edit
|
|
@@ -37,7 +36,6 @@ theme:
|
|
|
37
36
|
- content.tooltips
|
|
38
37
|
- search.highlight
|
|
39
38
|
- search.suggest
|
|
40
|
-
- navigation.sections
|
|
41
39
|
|
|
42
40
|
palette:
|
|
43
41
|
# Palette toggle for automatic mode
|
|
@@ -56,8 +54,8 @@ theme:
|
|
|
56
54
|
# Palette toggle for dark mode
|
|
57
55
|
- media: "(prefers-color-scheme: dark)"
|
|
58
56
|
scheme: slate
|
|
59
|
-
primary:
|
|
60
|
-
accent:
|
|
57
|
+
primary: teal
|
|
58
|
+
accent: teal
|
|
61
59
|
toggle:
|
|
62
60
|
icon: material/weather-night
|
|
63
61
|
name: Switch to system preference
|
|
@@ -68,18 +66,18 @@ nav:
|
|
|
68
66
|
- 🏃🏼♂️ Quick Start: quick-start.md
|
|
69
67
|
- 🎯 Examples: examples.md
|
|
70
68
|
- 📚 Tutorials: tutorials.md
|
|
71
|
-
- 🐍 API Reference:
|
|
69
|
+
- 🐍 API Reference:
|
|
70
|
+
- Overview: references/index.md
|
|
71
|
+
- DataChain: references/datachain.md
|
|
72
|
+
- DataType: references/datatype.md
|
|
73
|
+
- File: references/file.md
|
|
74
|
+
- UDF: references/udf.md
|
|
75
|
+
- Torch: references/torch.md
|
|
76
|
+
- SQL: references/sql.md
|
|
72
77
|
- 🤝 Contributing: contributing.md
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
- references/datatype.md
|
|
77
|
-
- references/file.md
|
|
78
|
-
- references/udf.md
|
|
79
|
-
- references/torch.md
|
|
80
|
-
- references/sql.md
|
|
81
|
-
- DataChain Website: https://datachain.ai" target="_blank"
|
|
82
|
-
- Studio: https://studio.datachain.ai" target="_blank"
|
|
78
|
+
|
|
79
|
+
- DataChain Website ↗: https://datachain.ai" target="_blank"
|
|
80
|
+
- Studio ↗: https://studio.datachain.ai" target="_blank"
|
|
83
81
|
|
|
84
82
|
markdown_extensions:
|
|
85
83
|
- abbr
|
|
@@ -105,7 +103,11 @@ markdown_extensions:
|
|
|
105
103
|
- pymdownx.tilde
|
|
106
104
|
- tables
|
|
107
105
|
- toc:
|
|
108
|
-
permalink:
|
|
106
|
+
permalink: ''
|
|
107
|
+
|
|
108
|
+
# Custom permalink style: https://github.com/squidfunk/mkdocs-material/discussions/3535
|
|
109
|
+
extra_css:
|
|
110
|
+
- css/github-permalink-style.css
|
|
109
111
|
|
|
110
112
|
extra:
|
|
111
113
|
social:
|
|
@@ -104,14 +104,14 @@ dev = [
|
|
|
104
104
|
]
|
|
105
105
|
examples = [
|
|
106
106
|
"datachain[tests]",
|
|
107
|
-
"numpy>=1,<2",
|
|
108
107
|
"defusedxml",
|
|
109
108
|
"accelerate",
|
|
110
|
-
"
|
|
109
|
+
"unstructured_ingest[embed-huggingface]",
|
|
110
|
+
"unstructured[pdf]",
|
|
111
111
|
"pdfplumber==0.11.4",
|
|
112
112
|
"huggingface_hub[hf_transfer]",
|
|
113
113
|
"onnx==1.16.1",
|
|
114
|
-
"ultralytics==8.3.
|
|
114
|
+
"ultralytics==8.3.48"
|
|
115
115
|
]
|
|
116
116
|
|
|
117
117
|
[project.urls]
|
|
@@ -19,7 +19,6 @@ from typing import (
|
|
|
19
19
|
)
|
|
20
20
|
|
|
21
21
|
import orjson
|
|
22
|
-
import pandas as pd
|
|
23
22
|
import sqlalchemy
|
|
24
23
|
from pydantic import BaseModel
|
|
25
24
|
from sqlalchemy.sql.functions import GenericFunction
|
|
@@ -57,6 +56,7 @@ from datachain.telemetry import telemetry
|
|
|
57
56
|
from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
|
|
58
57
|
|
|
59
58
|
if TYPE_CHECKING:
|
|
59
|
+
import pandas as pd
|
|
60
60
|
from pyarrow import DataType as ArrowDataType
|
|
61
61
|
from typing_extensions import Concatenate, ParamSpec, Self
|
|
62
62
|
|
|
@@ -1701,6 +1701,8 @@ class DataChain:
|
|
|
1701
1701
|
Parameters:
|
|
1702
1702
|
flatten : Whether to use a multiindex or flatten column names.
|
|
1703
1703
|
"""
|
|
1704
|
+
import pandas as pd
|
|
1705
|
+
|
|
1704
1706
|
headers, max_length = self._effective_signals_schema.get_headers_with_length()
|
|
1705
1707
|
if flatten or max_length < 2:
|
|
1706
1708
|
columns = [".".join(filter(None, header)) for header in headers]
|
|
@@ -1724,6 +1726,8 @@ class DataChain:
|
|
|
1724
1726
|
transpose : Whether to transpose rows and columns.
|
|
1725
1727
|
truncate : Whether or not to truncate the contents of columns.
|
|
1726
1728
|
"""
|
|
1729
|
+
import pandas as pd
|
|
1730
|
+
|
|
1727
1731
|
dc = self.limit(limit) if limit > 0 else self # type: ignore[misc]
|
|
1728
1732
|
df = dc.to_pandas(flatten)
|
|
1729
1733
|
|
|
@@ -17,7 +17,6 @@ from urllib.request import url2pathname
|
|
|
17
17
|
|
|
18
18
|
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
|
|
19
19
|
from PIL import Image
|
|
20
|
-
from pyarrow.dataset import dataset
|
|
21
20
|
from pydantic import Field, field_validator
|
|
22
21
|
|
|
23
22
|
from datachain.client.fileslice import FileSlice
|
|
@@ -452,6 +451,8 @@ class ArrowRow(DataModel):
|
|
|
452
451
|
@contextmanager
|
|
453
452
|
def open(self):
|
|
454
453
|
"""Stream row contents from indexed file."""
|
|
454
|
+
from pyarrow.dataset import dataset
|
|
455
|
+
|
|
455
456
|
if self.file._caching_enabled:
|
|
456
457
|
self.file.ensure_cached()
|
|
457
458
|
path = self.file.get_local_path()
|
|
@@ -6,7 +6,6 @@ from collections.abc import Iterator
|
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from typing import Callable
|
|
8
8
|
|
|
9
|
-
import datamodel_code_generator
|
|
10
9
|
import jmespath as jsp
|
|
11
10
|
from pydantic import BaseModel, ConfigDict, Field, ValidationError # noqa: F401
|
|
12
11
|
|
|
@@ -67,6 +66,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
|
|
|
67
66
|
data_type = "json" # treat json line as plain JSON in auto-schema
|
|
68
67
|
data_string = json.dumps(json_object)
|
|
69
68
|
|
|
69
|
+
import datamodel_code_generator
|
|
70
|
+
|
|
70
71
|
input_file_types = {i.value: i for i in datamodel_code_generator.InputFileType}
|
|
71
72
|
input_file_type = input_file_types[data_type]
|
|
72
73
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
@@ -7,7 +7,6 @@ from torch import float32
|
|
|
7
7
|
from torch.distributed import get_rank, get_world_size
|
|
8
8
|
from torch.utils.data import IterableDataset, get_worker_info
|
|
9
9
|
from torchvision.transforms import v2
|
|
10
|
-
from tqdm import tqdm
|
|
11
10
|
|
|
12
11
|
from datachain import Session
|
|
13
12
|
from datachain.asyn import AsyncMapper
|
|
@@ -112,10 +111,7 @@ class PytorchDataset(IterableDataset):
|
|
|
112
111
|
from datachain.lib.udf import _prefetch_input
|
|
113
112
|
|
|
114
113
|
rows = AsyncMapper(_prefetch_input, rows, workers=self.prefetch).iterate()
|
|
115
|
-
|
|
116
|
-
desc = f"Parsed PyTorch dataset for rank={total_rank} worker"
|
|
117
|
-
with tqdm(rows, desc=desc, unit=" rows", position=total_rank) as rows_it:
|
|
118
|
-
yield from map(self._process_row, rows_it)
|
|
114
|
+
yield from map(self._process_row, rows)
|
|
119
115
|
|
|
120
116
|
def _process_row(self, row_features):
|
|
121
117
|
row = []
|
|
@@ -402,9 +402,20 @@ class SignalSchema:
|
|
|
402
402
|
if ModelStore.is_pydantic(finfo.annotation):
|
|
403
403
|
SignalSchema._set_file_stream(getattr(obj, field), catalog, cache)
|
|
404
404
|
|
|
405
|
-
def get_column_type(self, col_name: str) -> DataType:
|
|
405
|
+
def get_column_type(self, col_name: str, with_subtree: bool = False) -> DataType:
|
|
406
|
+
"""
|
|
407
|
+
Returns column type by column name.
|
|
408
|
+
|
|
409
|
+
If `with_subtree` is True, then it will return the type of the column
|
|
410
|
+
even if it has a subtree (e.g. model with nested fields), otherwise it will
|
|
411
|
+
return the type of the column (standard type field, not the model).
|
|
412
|
+
|
|
413
|
+
If column is not found, raises `SignalResolvingError`.
|
|
414
|
+
"""
|
|
406
415
|
for path, _type, has_subtree, _ in self.get_flat_tree():
|
|
407
|
-
if not has_subtree and DEFAULT_DELIMITER.join(
|
|
416
|
+
if (with_subtree or not has_subtree) and DEFAULT_DELIMITER.join(
|
|
417
|
+
path
|
|
418
|
+
) == col_name:
|
|
408
419
|
return _type
|
|
409
420
|
raise SignalResolvingError([col_name], "is not found")
|
|
410
421
|
|
|
@@ -492,14 +503,25 @@ class SignalSchema:
|
|
|
492
503
|
# renaming existing signal
|
|
493
504
|
del new_values[value.name]
|
|
494
505
|
new_values[name] = self.values[value.name]
|
|
495
|
-
|
|
506
|
+
continue
|
|
507
|
+
if isinstance(value, Column):
|
|
508
|
+
# adding new signal from existing signal field
|
|
509
|
+
try:
|
|
510
|
+
new_values[name] = self.get_column_type(
|
|
511
|
+
value.name, with_subtree=True
|
|
512
|
+
)
|
|
513
|
+
continue
|
|
514
|
+
except SignalResolvingError:
|
|
515
|
+
pass
|
|
516
|
+
if isinstance(value, Func):
|
|
496
517
|
# adding new signal with function
|
|
497
518
|
new_values[name] = value.get_result_type(self)
|
|
498
|
-
|
|
519
|
+
continue
|
|
520
|
+
if isinstance(value, ColumnElement):
|
|
499
521
|
# adding new signal
|
|
500
522
|
new_values[name] = sql_to_python(value)
|
|
501
|
-
|
|
502
|
-
|
|
523
|
+
continue
|
|
524
|
+
new_values[name] = value
|
|
503
525
|
|
|
504
526
|
return SignalSchema(new_values)
|
|
505
527
|
|
|
@@ -35,7 +35,6 @@ from sqlalchemy.sql.schema import TableClause
|
|
|
35
35
|
from sqlalchemy.sql.selectable import Select
|
|
36
36
|
|
|
37
37
|
from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
|
|
38
|
-
from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
|
|
39
38
|
from datachain.data_storage.schema import (
|
|
40
39
|
PARTITION_COLUMN_ID,
|
|
41
40
|
partition_col_names,
|
|
@@ -394,6 +393,8 @@ class UDFStep(Step, ABC):
|
|
|
394
393
|
"""
|
|
395
394
|
|
|
396
395
|
def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
|
|
396
|
+
from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE
|
|
397
|
+
|
|
397
398
|
use_partitioning = self.partition_by is not None
|
|
398
399
|
batching = self.udf.get_batching(use_partitioning)
|
|
399
400
|
workers = self.workers
|
|
@@ -1087,6 +1088,8 @@ class DatasetQuery:
|
|
|
1087
1088
|
def delete(
|
|
1088
1089
|
name: str, version: Optional[int] = None, catalog: Optional["Catalog"] = None
|
|
1089
1090
|
) -> None:
|
|
1091
|
+
from datachain.catalog import get_catalog
|
|
1092
|
+
|
|
1090
1093
|
catalog = catalog or get_catalog()
|
|
1091
1094
|
version = version or catalog.get_dataset(name).latest_version
|
|
1092
1095
|
catalog.remove_dataset(name, version)
|