datachain 0.7.10__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- {datachain-0.7.10 → datachain-0.8.0}/.github/workflows/tests.yml +16 -3
- {datachain-0.7.10 → datachain-0.8.0}/.pre-commit-config.yaml +1 -1
- {datachain-0.7.10/src/datachain.egg-info → datachain-0.8.0}/PKG-INFO +10 -10
- {datachain-0.7.10 → datachain-0.8.0}/README.rst +5 -6
- {datachain-0.7.10 → datachain-0.8.0}/docs/contributing.md +4 -0
- datachain-0.8.0/docs/css/github-permalink-style.css +39 -0
- {datachain-0.7.10 → datachain-0.8.0}/docs/examples.md +4 -1
- {datachain-0.7.10 → datachain-0.8.0}/docs/index.md +4 -1
- {datachain-0.7.10 → datachain-0.8.0}/docs/quick-start.md +4 -0
- {datachain-0.7.10 → datachain-0.8.0}/docs/references/index.md +4 -0
- {datachain-0.7.10 → datachain-0.8.0}/docs/tutorials.md +4 -0
- datachain-0.8.0/examples/get_started/json-csv-reader.py +82 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/get_started/torch-loader.py +25 -20
- {datachain-0.7.10 → datachain-0.8.0}/examples/llm_and_nlp/unstructured-embeddings-gen.py +7 -5
- {datachain-0.7.10 → datachain-0.8.0}/mkdocs.yml +18 -16
- {datachain-0.7.10 → datachain-0.8.0}/noxfile.py +2 -0
- {datachain-0.7.10 → datachain-0.8.0}/pyproject.toml +5 -4
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/catalog/catalog.py +53 -41
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/cli.py +25 -3
- datachain-0.8.0/src/datachain/client/__init__.py +3 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/data_storage/sqlite.py +20 -6
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/dc.py +160 -110
- datachain-0.8.0/src/datachain/lib/diff.py +197 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/file.py +2 -1
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/meta_formats.py +40 -43
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/pytorch.py +1 -5
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/signal_schema.py +28 -6
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/dataset.py +5 -1
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/remote/studio.py +53 -1
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/studio.py +47 -2
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/toolkit/split.py +19 -6
- {datachain-0.7.10 → datachain-0.8.0/src/datachain.egg-info}/PKG-INFO +10 -10
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain.egg-info/SOURCES.txt +3 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain.egg-info/requires.txt +4 -3
- {datachain-0.7.10 → datachain-0.8.0}/tests/conftest.py +12 -10
- {datachain-0.7.10 → datachain-0.8.0}/tests/examples/test_examples.py +14 -29
- {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_datachain.py +1 -1
- {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_meta_formats.py +4 -4
- {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_pull.py +18 -12
- datachain-0.8.0/tests/func/test_toolkit.py +51 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/test_cli_studio.py +52 -1
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_datachain.py +3 -3
- datachain-0.8.0/tests/unit/lib/test_diff.py +498 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_signal_schema.py +5 -0
- datachain-0.7.10/examples/get_started/json-csv-reader.py +0 -101
- datachain-0.7.10/src/datachain/client/__init__.py +0 -4
- datachain-0.7.10/tests/func/test_toolkit.py +0 -42
- {datachain-0.7.10 → datachain-0.8.0}/.cruft.json +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/.gitattributes +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/.github/ISSUE_TEMPLATE/empty_issue.md +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/.github/codecov.yaml +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/.github/dependabot.yml +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/.github/workflows/benchmarks.yml +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/.github/workflows/release.yml +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/.github/workflows/tests-studio.yml +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/.github/workflows/update-template.yaml +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/.gitignore +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/CODE_OF_CONDUCT.rst +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/LICENSE +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/docs/assets/captioned_cartoons.png +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/docs/assets/datachain-white.svg +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/docs/assets/datachain.svg +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/docs/overrides/main.html +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/docs/references/datachain.md +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/docs/references/datatype.md +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/docs/references/file.md +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/docs/references/sql.md +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/docs/references/torch.md +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/docs/references/udf.md +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/computer_vision/iptc_exif_xmp_lib.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/computer_vision/llava2_image_desc_lib.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/computer_vision/openimage-detect.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/computer_vision/ultralytics-bbox.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/computer_vision/ultralytics-pose.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/computer_vision/ultralytics-segment.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/get_started/common_sql_functions.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/get_started/udfs/parallel.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/get_started/udfs/simple.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/get_started/udfs/stateful.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/llm_and_nlp/claude-query.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/llm_and_nlp/hf-dataset-llm-eval.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/llm_and_nlp/unstructured-summary-map.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/multimodal/clip_inference.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/multimodal/hf_pipeline.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/multimodal/openai_image_desc_lib.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/multimodal/wds.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/examples/multimodal/wds_filtered.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/setup.cfg +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/__main__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/asyn.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/cache.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/catalog/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/catalog/datasource.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/catalog/loader.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/cli_utils.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/client/azure.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/client/fileslice.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/client/fsspec.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/client/gcs.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/client/hf.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/client/local.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/client/s3.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/config.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/data_storage/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/data_storage/db_engine.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/data_storage/job.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/data_storage/metastore.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/data_storage/schema.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/data_storage/serializer.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/data_storage/warehouse.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/dataset.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/error.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/aggregate.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/array.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/base.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/conditional.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/func.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/numeric.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/path.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/random.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/string.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/func/window.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/job.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/arrow.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/clip.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/convert/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/convert/flatten.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/convert/python_to_sql.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/convert/sql_to_python.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/convert/unflatten.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/convert/values_to_tuples.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/data_model.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/dataset_info.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/hf.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/image.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/listing.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/listing_info.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/model_store.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/settings.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/tar.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/text.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/udf.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/udf_signature.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/utils.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/vfile.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/webdataset.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/lib/webdataset_laion.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/listing.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/model/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/model/bbox.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/model/pose.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/model/segment.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/model/ultralytics/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/model/ultralytics/bbox.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/model/ultralytics/pose.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/model/ultralytics/segment.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/node.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/nodes_fetcher.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/nodes_thread_pool.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/progress.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/py.typed +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/batch.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/dispatch.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/metrics.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/params.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/queue.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/schema.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/query/session.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/remote/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/default/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/default/base.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/functions/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/functions/aggregate.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/functions/array.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/functions/conditional.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/functions/numeric.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/functions/path.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/functions/random.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/functions/string.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/selectable.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/sqlite/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/sqlite/base.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/sqlite/types.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/sqlite/vector.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/types.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/sql/utils.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/telemetry.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/toolkit/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/torch/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain/utils.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain.egg-info/dependency_links.txt +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain.egg-info/entry_points.txt +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/src/datachain.egg-info/top_level.txt +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/conftest.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/datasets/.dvc/.gitignore +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/datasets/.dvc/config +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/datasets/.gitignore +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/datasets/laion-tiny.npz.dvc +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/test_datachain.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/test_ls.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/benchmarks/test_version.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/data.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/examples/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/examples/test_wds_e2e.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/examples/wds_data.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/func/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_catalog.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_client.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_dataset_query.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_datasets.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_feature_pickling.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_listing.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_ls.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_metrics.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_pytorch.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/func/test_query.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/scripts/feature_class.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/scripts/feature_class_exception.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/scripts/feature_class_parallel.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/scripts/feature_class_parallel_data_model.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/scripts/name_len_slow.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/test_atomicity.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/test_cli_e2e.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/test_query_e2e.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/test_telemetry.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/conftest.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_arrow.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_clip.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_datachain_bootstrap.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_datachain_merge.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_feature.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_feature_utils.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_file.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_hf.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_image.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_listing_info.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_models.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_schema.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_sql_to_python.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_text.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_udf_signature.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_utils.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/lib/test_webdataset.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/sqlite/__init__.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/sqlite/test_types.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/sqlite/test_utils.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/test_array.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/test_conditional.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/test_path.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/test_random.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/test_selectable.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/sql/test_string.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_asyn.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_cache.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_catalog.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_catalog_loader.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_cli_parsing.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_client.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_client_s3.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_config.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_data_storage.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_database_engine.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_dataset.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_dispatch.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_fileslice.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_func.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_listing.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_metastore.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_module_exports.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_query.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_query_metrics.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_query_params.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_serializer.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_session.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_utils.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/unit/test_warehouse.py +0 -0
- {datachain-0.7.10 → datachain-0.8.0}/tests/utils.py +0 -0
|
@@ -136,7 +136,7 @@ jobs:
|
|
|
136
136
|
strategy:
|
|
137
137
|
fail-fast: false
|
|
138
138
|
matrix:
|
|
139
|
-
os: [ubuntu-latest,
|
|
139
|
+
os: [ubuntu-latest, windows-latest]
|
|
140
140
|
pyv: ['3.9', '3.12']
|
|
141
141
|
group: ['get_started', 'llm_and_nlp or computer_vision', 'multimodal']
|
|
142
142
|
exclude:
|
|
@@ -166,7 +166,20 @@ jobs:
|
|
|
166
166
|
- name: Install nox
|
|
167
167
|
run: uv pip install nox --system
|
|
168
168
|
|
|
169
|
+
# HF runs against actual API - thus run it only once
|
|
170
|
+
- name: Set hf token
|
|
171
|
+
if: matrix.os == 'ubuntu-latest' && matrix.pyv == '3.12'
|
|
172
|
+
run: echo 'HF_TOKEN=${{ secrets.HF_TOKEN }}' >> "$GITHUB_ENV"
|
|
173
|
+
|
|
169
174
|
- name: Run examples
|
|
170
|
-
env:
|
|
171
|
-
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
172
175
|
run: nox -s examples -p ${{ matrix.pyv }} -- -m "${{ matrix.group }}"
|
|
176
|
+
|
|
177
|
+
check:
|
|
178
|
+
if: always()
|
|
179
|
+
needs: [lint, datachain, examples]
|
|
180
|
+
runs-on: ubuntu-latest
|
|
181
|
+
steps:
|
|
182
|
+
- uses: re-actors/alls-green@release/v1
|
|
183
|
+
with:
|
|
184
|
+
allowed-failures: examples
|
|
185
|
+
jobs: ${{ toJSON(needs) }}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.7.10
|
|
3
|
+
Version: 0.8.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -46,6 +46,7 @@ Requires-Dist: iterative-telemetry>=0.0.9
|
|
|
46
46
|
Requires-Dist: platformdirs
|
|
47
47
|
Requires-Dist: dvc-studio-client<1,>=0.21
|
|
48
48
|
Requires-Dist: tabulate
|
|
49
|
+
Requires-Dist: websockets
|
|
49
50
|
Provides-Extra: docs
|
|
50
51
|
Requires-Dist: mkdocs>=1.5.2; extra == "docs"
|
|
51
52
|
Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
|
|
@@ -91,14 +92,14 @@ Requires-Dist: types-requests; extra == "dev"
|
|
|
91
92
|
Requires-Dist: types-tabulate; extra == "dev"
|
|
92
93
|
Provides-Extra: examples
|
|
93
94
|
Requires-Dist: datachain[tests]; extra == "examples"
|
|
94
|
-
Requires-Dist: numpy<2,>=1; extra == "examples"
|
|
95
95
|
Requires-Dist: defusedxml; extra == "examples"
|
|
96
96
|
Requires-Dist: accelerate; extra == "examples"
|
|
97
|
-
Requires-Dist:
|
|
97
|
+
Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
|
|
98
|
+
Requires-Dist: unstructured[pdf]; extra == "examples"
|
|
98
99
|
Requires-Dist: pdfplumber==0.11.4; extra == "examples"
|
|
99
100
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
100
101
|
Requires-Dist: onnx==1.16.1; extra == "examples"
|
|
101
|
-
Requires-Dist: ultralytics==8.3.
|
|
102
|
+
Requires-Dist: ultralytics==8.3.50; extra == "examples"
|
|
102
103
|
|
|
103
104
|
================
|
|
104
105
|
|logo| DataChain
|
|
@@ -138,6 +139,11 @@ Use Cases
|
|
|
138
139
|
3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
|
|
139
140
|
Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
|
|
140
141
|
|
|
142
|
+
Getting Started
|
|
143
|
+
===============
|
|
144
|
+
|
|
145
|
+
Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
|
|
146
|
+
to get started with `DataChain` and learn more.
|
|
141
147
|
|
|
142
148
|
Key Features
|
|
143
149
|
============
|
|
@@ -161,12 +167,6 @@ Key Features
|
|
|
161
167
|
- Pass datasets to Pytorch and Tensorflow, or export them back into storage.
|
|
162
168
|
|
|
163
169
|
|
|
164
|
-
Getting Started
|
|
165
|
-
===============
|
|
166
|
-
|
|
167
|
-
Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ to get started with `DataChain` and learn more.
|
|
168
|
-
|
|
169
|
-
|
|
170
170
|
Contributing
|
|
171
171
|
============
|
|
172
172
|
|
|
@@ -36,6 +36,11 @@ Use Cases
|
|
|
36
36
|
3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
|
|
37
37
|
Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
|
|
38
38
|
|
|
39
|
+
Getting Started
|
|
40
|
+
===============
|
|
41
|
+
|
|
42
|
+
Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
|
|
43
|
+
to get started with `DataChain` and learn more.
|
|
39
44
|
|
|
40
45
|
Key Features
|
|
41
46
|
============
|
|
@@ -59,12 +64,6 @@ Key Features
|
|
|
59
64
|
- Pass datasets to Pytorch and Tensorflow, or export them back into storage.
|
|
60
65
|
|
|
61
66
|
|
|
62
|
-
Getting Started
|
|
63
|
-
===============
|
|
64
|
-
|
|
65
|
-
Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ to get started with `DataChain` and learn more.
|
|
66
|
-
|
|
67
|
-
|
|
68
67
|
Contributing
|
|
69
68
|
============
|
|
70
69
|
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
.headerlink {
|
|
2
|
+
--permalink-size: 16px; /* for font-relative sizes, 0.6em is a good choice */
|
|
3
|
+
--permalink-spacing: 4px;
|
|
4
|
+
|
|
5
|
+
width: calc(var(--permalink-size) + var(--permalink-spacing));
|
|
6
|
+
height: var(--permalink-size);
|
|
7
|
+
vertical-align: middle;
|
|
8
|
+
background-color: var(--md-default-fg-color--lighter);
|
|
9
|
+
background-size: var(--permalink-size);
|
|
10
|
+
mask-size: var(--permalink-size);
|
|
11
|
+
-webkit-mask-size: var(--permalink-size);
|
|
12
|
+
mask-repeat: no-repeat;
|
|
13
|
+
-webkit-mask-repeat: no-repeat;
|
|
14
|
+
visibility: visible;
|
|
15
|
+
mask-image: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" version="1.1" width="16" height="16" aria-hidden="true"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg>');
|
|
16
|
+
-webkit-mask-image: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" version="1.1" width="16" height="16" aria-hidden="true"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg>');
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
[id]:target .headerlink {
|
|
20
|
+
background-color: var(--md-typeset-a-color);
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
.headerlink:hover {
|
|
24
|
+
background-color: var(--md-accent-fg-color) !important;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
@media screen and (min-width: 76.25em) {
|
|
28
|
+
h1, h2, h3, h4, h5, h6 {
|
|
29
|
+
display: flex;
|
|
30
|
+
align-items: center;
|
|
31
|
+
flex-direction: row;
|
|
32
|
+
column-gap: 0.2em; /* fixes spaces in titles */
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
.headerlink {
|
|
36
|
+
order: -1;
|
|
37
|
+
margin-left: calc(var(--permalink-size) * -1 - var(--permalink-spacing)) !important;
|
|
38
|
+
}
|
|
39
|
+
}
|
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: Examples
|
|
3
|
+
---
|
|
1
4
|
|
|
2
5
|
# Examples
|
|
3
6
|
|
|
@@ -225,7 +228,7 @@ Here is an example from MS COCO “captions” JSON which employs separate secti
|
|
|
225
228
|
}
|
|
226
229
|
```
|
|
227
230
|
|
|
228
|
-
Note how complicated the setup is. Every image is references by the name, and the metadata for this file is keyed by the “id” field. This same field is references later in the “annotations
|
|
231
|
+
Note how complicated the setup is. Every image is referenced by the name, and the metadata for this file is keyed by the “id” field. This same field is referenced later in the “annotations” array, which is present in JSON files describing captions and the detected instances. The categories for the instances are stored in the “categories” array.
|
|
229
232
|
|
|
230
233
|
However, Datachain can easily parse the entire COCO structure via several reading and merging operators:
|
|
231
234
|
|
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: Welcome to DataChain
|
|
3
|
+
---
|
|
1
4
|
# <a class="main-header-link" href="/" ><img style="display: inline-block;" src="/assets/datachain.svg" alt="DataChain"> <span style="display: inline-block;"> DataChain</span></a>
|
|
2
5
|
|
|
3
6
|
<style>
|
|
@@ -83,7 +86,7 @@ The following pages provide detailed documentation on DataChain's features, arch
|
|
|
83
86
|
- [🏃🏼♂️ Quick Start](quick-start.md): Get up and running with DataChain in no time.
|
|
84
87
|
- [🎯 Examples](examples.md): Explore practical examples and use cases.
|
|
85
88
|
- [📚 Tutorials](tutorials.md): Learn how to use DataChain for specific tasks.
|
|
86
|
-
- [
|
|
89
|
+
- [🐍 API Reference](references/index.md): Dive into the technical details and API reference.
|
|
87
90
|
- [🤝 Contributing](contributing.md): Learn how to contribute to DataChain.
|
|
88
91
|
|
|
89
92
|
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: Tutorials
|
|
3
|
+
---
|
|
4
|
+
|
|
1
5
|
# Tutorials
|
|
2
6
|
|
|
3
7
|
* Multimodal: [GitHub](https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) or [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb)
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
|
|
5
|
+
from datachain import C, DataChain
|
|
6
|
+
from datachain.lib.data_model import ModelStore
|
|
7
|
+
from datachain.lib.meta_formats import gen_datamodel_code
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Sample model for static JSON model
|
|
11
|
+
class LicenseModel(BaseModel):
|
|
12
|
+
url: str
|
|
13
|
+
id: int
|
|
14
|
+
name: str
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
LicenseFeature = ModelStore.register(LicenseModel)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Sample model for static CSV model
|
|
21
|
+
class ChatDialog(BaseModel):
|
|
22
|
+
id: Optional[int] = None
|
|
23
|
+
count: Optional[int] = None
|
|
24
|
+
sender: Optional[str] = None
|
|
25
|
+
text: Optional[str] = None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
ChatFeature = ModelStore.register(ChatDialog)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def main():
|
|
32
|
+
# Dynamic JSONl schema from 2 objects
|
|
33
|
+
uri = "gs://datachain-demo/jsonl/object.jsonl"
|
|
34
|
+
jsonl_ds = DataChain.from_json(uri, format="jsonl", anon="True")
|
|
35
|
+
jsonl_ds.show()
|
|
36
|
+
|
|
37
|
+
# Dynamic JSON schema from 200 OpenImage json-pairs with validation errors
|
|
38
|
+
uri = "gs://datachain-demo/openimages-v6-test-jsonpairs/*json"
|
|
39
|
+
schema_uri = (
|
|
40
|
+
"gs://datachain-demo/openimages-v6-test-jsonpairs/08392c290ecc9d2a.json"
|
|
41
|
+
)
|
|
42
|
+
json_pairs_ds = DataChain.from_json(
|
|
43
|
+
uri, schema_from=schema_uri, jmespath="@", model_name="OpenImage", anon="True"
|
|
44
|
+
)
|
|
45
|
+
json_pairs_ds.show()
|
|
46
|
+
|
|
47
|
+
uri = "gs://datachain-demo/coco2017/annotations_captions/"
|
|
48
|
+
|
|
49
|
+
# Print JSON schema in Pydantic format from main COCO annotation
|
|
50
|
+
chain = DataChain.from_storage(uri, anon="True").filter(
|
|
51
|
+
C("file.path").glob("*.json")
|
|
52
|
+
)
|
|
53
|
+
file = next(chain.limit(1).collect("file"))
|
|
54
|
+
print(gen_datamodel_code(file, jmespath="@", model_name="Coco"))
|
|
55
|
+
|
|
56
|
+
# Static JSON schema test parsing 3/7 objects
|
|
57
|
+
static_json_ds = DataChain.from_json(
|
|
58
|
+
uri, jmespath="licenses", spec=LicenseFeature, nrows=3, anon="True"
|
|
59
|
+
)
|
|
60
|
+
static_json_ds.show()
|
|
61
|
+
|
|
62
|
+
# Dynamic JSON schema test parsing 5K objects
|
|
63
|
+
dynamic_json_ds = DataChain.from_json(uri, jmespath="images", anon="True")
|
|
64
|
+
print(dynamic_json_ds.to_pandas())
|
|
65
|
+
|
|
66
|
+
# Static CSV with header schema test parsing 3.5K objects
|
|
67
|
+
uri = "gs://datachain-demo/chatbot-csv/"
|
|
68
|
+
static_csv_ds = DataChain.from_csv(
|
|
69
|
+
uri, output=ChatDialog, object_name="chat", anon="True"
|
|
70
|
+
)
|
|
71
|
+
static_csv_ds.print_schema()
|
|
72
|
+
static_csv_ds.show()
|
|
73
|
+
|
|
74
|
+
# Dynamic CSV with header schema test parsing 3/3M objects
|
|
75
|
+
uri = "gs://datachain-demo/laion-aesthetics-csv/laion_aesthetics_1024_33M_1.csv"
|
|
76
|
+
dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", nrows=3, anon="True")
|
|
77
|
+
dynamic_csv_ds.print_schema()
|
|
78
|
+
dynamic_csv_ds.show()
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
if __name__ == "__main__":
|
|
82
|
+
main()
|
|
@@ -5,6 +5,7 @@ To install the required dependencies:
|
|
|
5
5
|
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
+
import multiprocessing
|
|
8
9
|
import os
|
|
9
10
|
from posixpath import basename
|
|
10
11
|
|
|
@@ -12,17 +13,18 @@ import torch
|
|
|
12
13
|
from torch import nn, optim
|
|
13
14
|
from torch.utils.data import DataLoader
|
|
14
15
|
from torchvision.transforms import v2
|
|
16
|
+
from tqdm import tqdm
|
|
15
17
|
|
|
16
18
|
from datachain import C, DataChain
|
|
17
19
|
from datachain.torch import label_to_int
|
|
18
20
|
|
|
19
21
|
STORAGE = "gs://datachain-demo/dogs-and-cats/"
|
|
20
|
-
NUM_EPOCHS = os.getenv("NUM_EPOCHS", "3")
|
|
22
|
+
NUM_EPOCHS = int(os.getenv("NUM_EPOCHS", "3"))
|
|
21
23
|
|
|
22
24
|
# Define transformation for data preprocessing
|
|
23
25
|
transform = v2.Compose(
|
|
24
26
|
[
|
|
25
|
-
v2.
|
|
27
|
+
v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True)]),
|
|
26
28
|
v2.Resize((64, 64)),
|
|
27
29
|
v2.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
|
|
28
30
|
]
|
|
@@ -54,6 +56,7 @@ class CNN(nn.Module):
|
|
|
54
56
|
if __name__ == "__main__":
|
|
55
57
|
ds = (
|
|
56
58
|
DataChain.from_storage(STORAGE, type="image")
|
|
59
|
+
.settings(cache=True, prefetch=25)
|
|
57
60
|
.filter(C("file.path").glob("*.jpg"))
|
|
58
61
|
.map(
|
|
59
62
|
label=lambda path: label_to_int(basename(path)[:3], CLASSES),
|
|
@@ -64,8 +67,10 @@ if __name__ == "__main__":
|
|
|
64
67
|
|
|
65
68
|
train_loader = DataLoader(
|
|
66
69
|
ds.to_pytorch(transform=transform),
|
|
67
|
-
batch_size=
|
|
68
|
-
num_workers=2,
|
|
70
|
+
batch_size=25,
|
|
71
|
+
num_workers=max(4, os.cpu_count() or 2),
|
|
72
|
+
persistent_workers=True,
|
|
73
|
+
multiprocessing_context=multiprocessing.get_context("spawn"),
|
|
69
74
|
)
|
|
70
75
|
|
|
71
76
|
model = CNN()
|
|
@@ -73,19 +78,19 @@ if __name__ == "__main__":
|
|
|
73
78
|
optimizer = optim.Adam(model.parameters(), lr=0.001)
|
|
74
79
|
|
|
75
80
|
# Train the model
|
|
76
|
-
for epoch in range(
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
81
|
+
for epoch in range(NUM_EPOCHS):
|
|
82
|
+
with tqdm(
|
|
83
|
+
train_loader, desc=f"epoch {epoch + 1}/{NUM_EPOCHS}", unit="batch"
|
|
84
|
+
) as loader:
|
|
85
|
+
for data in loader:
|
|
86
|
+
inputs, labels = data
|
|
87
|
+
optimizer.zero_grad()
|
|
88
|
+
|
|
89
|
+
# Forward pass
|
|
90
|
+
outputs = model(inputs)
|
|
91
|
+
loss = criterion(outputs, labels)
|
|
92
|
+
|
|
93
|
+
# Backward pass and optimize
|
|
94
|
+
loss.backward()
|
|
95
|
+
optimizer.step()
|
|
96
|
+
loader.set_postfix(loss=loss.item())
|
|
@@ -12,11 +12,11 @@ from unstructured.cleaners.core import (
|
|
|
12
12
|
group_broken_paragraphs,
|
|
13
13
|
replace_unicode_quotes,
|
|
14
14
|
)
|
|
15
|
-
from unstructured.
|
|
15
|
+
from unstructured.partition.pdf import partition_pdf
|
|
16
|
+
from unstructured_ingest.embed.huggingface import (
|
|
16
17
|
HuggingFaceEmbeddingConfig,
|
|
17
18
|
HuggingFaceEmbeddingEncoder,
|
|
18
19
|
)
|
|
19
|
-
from unstructured.partition.pdf import partition_pdf
|
|
20
20
|
|
|
21
21
|
from datachain import C, DataChain, DataModel, File
|
|
22
22
|
|
|
@@ -43,6 +43,7 @@ def process_pdf(file: File) -> Iterator[Chunk]:
|
|
|
43
43
|
chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast")
|
|
44
44
|
|
|
45
45
|
# Clean the chunks and add new columns
|
|
46
|
+
text_chunks = []
|
|
46
47
|
for chunk in chunks:
|
|
47
48
|
chunk.apply(
|
|
48
49
|
lambda text: clean(
|
|
@@ -51,16 +52,17 @@ def process_pdf(file: File) -> Iterator[Chunk]:
|
|
|
51
52
|
)
|
|
52
53
|
chunk.apply(replace_unicode_quotes)
|
|
53
54
|
chunk.apply(group_broken_paragraphs)
|
|
55
|
+
text_chunks.append({"text": str(chunk)})
|
|
54
56
|
|
|
55
57
|
# create embeddings
|
|
56
|
-
chunks_embedded = embedding_encoder.embed_documents(
|
|
58
|
+
chunks_embedded = embedding_encoder.embed_documents(text_chunks)
|
|
57
59
|
|
|
58
60
|
# Add new rows to DataChain
|
|
59
61
|
for chunk in chunks_embedded:
|
|
60
62
|
yield Chunk(
|
|
61
63
|
key=file.path,
|
|
62
|
-
text=chunk.text,
|
|
63
|
-
embeddings=chunk.embeddings,
|
|
64
|
+
text=chunk.get("text"),
|
|
65
|
+
embeddings=chunk.get("embeddings"),
|
|
64
66
|
)
|
|
65
67
|
|
|
66
68
|
|
|
@@ -27,7 +27,6 @@ theme:
|
|
|
27
27
|
- navigation.tabs
|
|
28
28
|
- navigation.path
|
|
29
29
|
- navigation.top
|
|
30
|
-
- navigation.prune
|
|
31
30
|
- navigation.footer
|
|
32
31
|
- toc.follow
|
|
33
32
|
- content.action.edit
|
|
@@ -37,7 +36,6 @@ theme:
|
|
|
37
36
|
- content.tooltips
|
|
38
37
|
- search.highlight
|
|
39
38
|
- search.suggest
|
|
40
|
-
- navigation.sections
|
|
41
39
|
|
|
42
40
|
palette:
|
|
43
41
|
# Palette toggle for automatic mode
|
|
@@ -56,8 +54,8 @@ theme:
|
|
|
56
54
|
# Palette toggle for dark mode
|
|
57
55
|
- media: "(prefers-color-scheme: dark)"
|
|
58
56
|
scheme: slate
|
|
59
|
-
primary:
|
|
60
|
-
accent:
|
|
57
|
+
primary: teal
|
|
58
|
+
accent: teal
|
|
61
59
|
toggle:
|
|
62
60
|
icon: material/weather-night
|
|
63
61
|
name: Switch to system preference
|
|
@@ -68,18 +66,18 @@ nav:
|
|
|
68
66
|
- 🏃🏼♂️ Quick Start: quick-start.md
|
|
69
67
|
- 🎯 Examples: examples.md
|
|
70
68
|
- 📚 Tutorials: tutorials.md
|
|
71
|
-
- 🐍 API Reference:
|
|
69
|
+
- 🐍 API Reference:
|
|
70
|
+
- Overview: references/index.md
|
|
71
|
+
- DataChain: references/datachain.md
|
|
72
|
+
- DataType: references/datatype.md
|
|
73
|
+
- File: references/file.md
|
|
74
|
+
- UDF: references/udf.md
|
|
75
|
+
- Torch: references/torch.md
|
|
76
|
+
- SQL: references/sql.md
|
|
72
77
|
- 🤝 Contributing: contributing.md
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
- references/datatype.md
|
|
77
|
-
- references/file.md
|
|
78
|
-
- references/udf.md
|
|
79
|
-
- references/torch.md
|
|
80
|
-
- references/sql.md
|
|
81
|
-
- DataChain Website: https://datachain.ai" target="_blank"
|
|
82
|
-
- Studio: https://studio.datachain.ai" target="_blank"
|
|
78
|
+
|
|
79
|
+
- DataChain Website ↗: https://datachain.ai" target="_blank"
|
|
80
|
+
- Studio ↗: https://studio.datachain.ai" target="_blank"
|
|
83
81
|
|
|
84
82
|
markdown_extensions:
|
|
85
83
|
- abbr
|
|
@@ -105,7 +103,11 @@ markdown_extensions:
|
|
|
105
103
|
- pymdownx.tilde
|
|
106
104
|
- tables
|
|
107
105
|
- toc:
|
|
108
|
-
permalink:
|
|
106
|
+
permalink: ''
|
|
107
|
+
|
|
108
|
+
# Custom permalink style: https://github.com/squidfunk/mkdocs-material/discussions/3535
|
|
109
|
+
extra_css:
|
|
110
|
+
- css/github-permalink-style.css
|
|
109
111
|
|
|
110
112
|
extra:
|
|
111
113
|
social:
|
|
@@ -48,7 +48,8 @@ dependencies = [
|
|
|
48
48
|
"iterative-telemetry>=0.0.9",
|
|
49
49
|
"platformdirs",
|
|
50
50
|
"dvc-studio-client>=0.21,<1",
|
|
51
|
-
"tabulate"
|
|
51
|
+
"tabulate",
|
|
52
|
+
"websockets"
|
|
52
53
|
]
|
|
53
54
|
|
|
54
55
|
[project.optional-dependencies]
|
|
@@ -104,14 +105,14 @@ dev = [
|
|
|
104
105
|
]
|
|
105
106
|
examples = [
|
|
106
107
|
"datachain[tests]",
|
|
107
|
-
"numpy>=1,<2",
|
|
108
108
|
"defusedxml",
|
|
109
109
|
"accelerate",
|
|
110
|
-
"
|
|
110
|
+
"unstructured_ingest[embed-huggingface]",
|
|
111
|
+
"unstructured[pdf]",
|
|
111
112
|
"pdfplumber==0.11.4",
|
|
112
113
|
"huggingface_hub[hf_transfer]",
|
|
113
114
|
"onnx==1.16.1",
|
|
114
|
-
"ultralytics==8.3.
|
|
115
|
+
"ultralytics==8.3.50"
|
|
115
116
|
]
|
|
116
117
|
|
|
117
118
|
[project.urls]
|