datachain 0.2.12__py3-none-any.whl → 0.2.13__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

This version of datachain might be problematic.

datachain/catalog/catalog.py CHANGED
@@ -1,4 +1,5 @@
 import ast
+import glob
 import io
 import json
 import logging
@@ -709,7 +710,12 @@ class Catalog:
 
         client_config = client_config or self.client_config
         client, path = self.parse_url(source, **client_config)
-        prefix = posixpath.dirname(path)
+        stem = os.path.basename(os.path.normpath(path))
+        prefix = (
+            posixpath.dirname(path)
+            if glob.has_magic(stem) or client.fs.isfile(source)
+            else path
+        )
         storage_dataset_name = Storage.dataset_name(
             client.uri, posixpath.join(prefix, "")
         )
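The new prefix logic only trims the last path segment when it is a glob pattern or a single file; a plain directory path is now used as the listing prefix directly. A minimal standalone sketch of that branch, stdlib only (the `listing_prefix` helper is hypothetical, written here to mirror the diff):

    import glob
    import os
    import posixpath

    def listing_prefix(path: str, is_file: bool) -> str:
        # glob.has_magic() reports whether the last path segment
        # contains wildcard characters such as * or ?.
        stem = os.path.basename(os.path.normpath(path))
        return posixpath.dirname(path) if glob.has_magic(stem) or is_file else path

    # A glob pattern keeps only its parent directory as the prefix:
    assert listing_prefix("datasets/images/*.jpg", is_file=False) == "datasets/images"
    # A plain directory is now its own prefix (the 0.2.12 code would
    # have trimmed it to "datasets"):
    assert listing_prefix("datasets/images", is_file=False) == "datasets/images"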
datachain/cli.py CHANGED
@@ -491,6 +491,7 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
         type=int,
         help="Dataset version",
     )
+    show_parser.add_argument("--schema", action="store_true", help="Show schema")
     add_show_args(show_parser)
 
     query_parser = subp.add_parser(
@@ -816,10 +817,15 @@ def show(
     offset: int = 0,
     columns: Sequence[str] = (),
     no_collapse: bool = False,
+    schema: bool = False,
 ) -> None:
+    from datachain.lib.dc import DataChain
     from datachain.query import DatasetQuery
     from datachain.utils import show_records
 
+    dataset = catalog.get_dataset(name)
+    dataset_version = dataset.get_version(version or dataset.latest_version)
+
     query = (
         DatasetQuery(name=name, version=version, catalog=catalog)
         .select(*columns)
@@ -828,6 +834,10 @@
     )
     records = query.to_db_records()
     show_records(records, collapse_columns=not no_collapse)
+    if schema and dataset_version.feature_schema:
+        print("\nSchema:")
+        dc = DataChain(name=name, version=version, catalog=catalog)
+        dc.print_schema()
 
 
 def query(
@@ -1013,6 +1023,7 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
             offset=args.offset,
             columns=args.columns,
             no_collapse=args.no_collapse,
+            schema=args.schema,
         )
     elif args.command == "rm-dataset":
         rm_dataset(catalog, args.name, version=args.version, force=args.force)
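Taken together, these hunks wire a new `--schema` flag through `datachain show`, printing the dataset's feature schema after the records. A hedged sketch of the equivalent Python call (the dataset name `response` follows the README examples; assumes the default local catalog is used when none is passed):

    from datachain.lib.dc import DataChain

    # Same path the new CLI flag takes: load the saved dataset and
    # print its nested feature schema.
    dc = DataChain(name="response")
    dc.print_schema()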
datachain/lib/file.py CHANGED
@@ -20,7 +20,7 @@ from datachain.cache import UniqueId
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
 from datachain.lib.utils import DataChainError
-from datachain.sql.types import JSON, Int, String
+from datachain.sql.types import JSON, Boolean, DateTime, Int, String
 from datachain.utils import TIME_ZERO
 
 if TYPE_CHECKING:
@@ -126,11 +126,13 @@ class File(DataModel):
         "source": String,
         "parent": String,
         "name": String,
+        "size": Int,
         "version": String,
         "etag": String,
-        "size": Int,
-        "vtype": String,
+        "is_latest": Boolean,
+        "last_modified": DateTime,
         "location": JSON,
+        "vtype": String,
     }
 
     _unique_id_keys: ClassVar[list[str]] = [
@@ -214,7 +216,7 @@
         with self.open(mode="r") as stream:
             return stream.read()
 
-    def write(self, destination: str):
+    def save(self, destination: str):
         """Writes it's content to destination"""
         with open(destination, mode="wb") as f:
             f.write(self.read())
@@ -232,7 +234,7 @@
         dst_dir = os.path.dirname(dst)
         os.makedirs(dst_dir, exist_ok=True)
 
-        self.write(dst)
+        self.save(dst)
 
     def _set_stream(
         self,
@@ -330,7 +332,7 @@ class TextFile(File):
         with self.open() as stream:
             return stream.read()
 
-    def write(self, destination: str):
+    def save(self, destination: str):
         """Writes it's content to destination"""
         with open(destination, mode="w") as f:
             f.write(self.read_text())
@@ -344,7 +346,7 @@ class ImageFile(File):
         fobj = super().read()
         return Image.open(BytesIO(fobj))
 
-    def write(self, destination: str):
+    def save(self, destination: str):
         """Writes it's content to destination"""
         self.read().save(destination)
 
@@ -360,21 +362,25 @@ def get_file(type_: Literal["binary", "text", "image"] = "binary"):
         source: str,
         parent: str,
         name: str,
+        size: int,
         version: str,
         etag: str,
-        size: int,
-        vtype: str,
+        is_latest: bool,
+        last_modified: datetime,
         location: Optional[Union[dict, list[dict]]],
+        vtype: str,
     ) -> file:  # type: ignore[valid-type]
         return file(
            source=source,
            parent=parent,
            name=name,
+           size=size,
            version=version,
            etag=etag,
-           size=size,
-           vtype=vtype,
+           is_latest=is_latest,
+           last_modified=last_modified,
            location=location,
+           vtype=vtype,
        )
 
     return get_file_type
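Note that `write()` on `File` and its subclasses is renamed to `save()` in this release. A hedged usage sketch under 0.2.x API assumptions (`data/` is a hypothetical local directory; `from_storage` and single-signal `collect` are assumed to behave as in the README examples):

    from datachain.lib.dc import DataChain

    chain = DataChain.from_storage("data/")
    for file in chain.limit(1).collect("file"):
        # 0.2.13 spelling; in 0.2.12 this was file.write("copy.bin")
        file.save("copy.bin")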
datachain/query/dataset.py CHANGED
@@ -820,8 +820,16 @@ class SQLMutate(SQLClause):
     args: tuple[ColumnElement, ...]
 
     def apply_sql_clause(self, query: Select) -> Select:
-        subquery = query.subquery()
-        return sqlalchemy.select(*subquery.c, *self.args).select_from(subquery)
+        original_subquery = query.subquery()
+        # this is needed for new column to be used in clauses
+        # like ORDER BY, otherwise new column is not recognized
+        subquery = (
+            sqlalchemy.select(*original_subquery.c, *self.args)
+            .select_from(original_subquery)
+            .subquery()
+        )
+
+        return sqlalchemy.select(*subquery.c).select_from(subquery)
 
 
 @frozen
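The mutate fix is easiest to see in isolation: wrapping the mutated SELECT in a second subquery turns the computed expression into an ordinary named column that later clauses such as ORDER BY can reference. A minimal self-contained SQLAlchemy sketch (table and column names are hypothetical):

    import sqlalchemy as sa

    items = sa.table("items", sa.column("size"))
    base = sa.select(items.c.size).subquery()

    # Select the computed column from a wrapping subquery, as the patch
    # does, so "doubled" becomes a real column of the outer SELECT.
    wrapped = sa.select(*base.c, (base.c.size * 2).label("doubled")).subquery()
    stmt = sa.select(*wrapped.c).order_by(wrapped.c.doubled)

    print(stmt)  # the rendered SQL orders by the named column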
datachain-0.2.13.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.2.12
+Version: 0.2.13
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -45,6 +45,7 @@ Provides-Extra: dev
 Requires-Dist: datachain[docs,tests] ; extra == 'dev'
 Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
 Requires-Dist: types-python-dateutil ; extra == 'dev'
+Requires-Dist: types-pytz ; extra == 'dev'
 Requires-Dist: types-PyYAML ; extra == 'dev'
 Requires-Dist: types-requests ; extra == 'dev'
 Requires-Dist: types-ujson ; extra == 'dev'
@@ -103,20 +104,18 @@ AI 🔗 DataChain
 DataChain is an open-source Python library for processing and curating unstructured
 data at scale.
 
-🤖 AI-Driven Data Curation: Use local ML models, LLM APIs calls to enrich your data.
+🤖 AI-Driven Data Curation: Use local ML models or LLM APIs calls to enrich your data.
 
-🚀 GenAI Dataset scale: Handle 10s of milions of files or file snippets.
+🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
 
-🐍 Python-friendly: Use strictly typed `Pydantic`_ objects instead of JSON.
+🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
 
 
-To ensure efficiency, Datachain supports parallel processing, parallel data
-downloads, and out-of-memory computing. It excels at optimizing batch operations.
-While most GenAI tools focus on online applications and realtime, DataChain is designed
-for offline data processing, data curation and ETL.
+Datachain supports parallel processing, parallel data
+downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
 
-The typical use cases are Computer Vision data curation, LLM analytics
-and validation.
+The typical use cases include Computer Vision data curation, LLM analytics,
+and validation of multimodal AI applications.
 
 
 .. code:: console
@@ -128,25 +127,25 @@ and validation.
 Quick Start
 -----------
 
-Basic evaluation
-================
+Data curation with a local model
+=================================
 
 We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
-- 50 files total in the example.
-These dialogs involve users looking for better wireless plans chatting with bot.
-Our goal is to identify successful dialogs.
+- 50 files total in this example.
+These dialogs involve users chatting with a bot while looking for better wireless plans.
+Our goal is to identify the successful dialogs.
 
-The data used in the examples is publicly available. Please feel free to run this code.
+The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.
 
-First, we'll use a simple sentiment analysis model. Please install transformers.
+First, we'll show batch inference with a simple sentiment model using the `transformers` library:
 
 .. code:: shell
 
     pip install transformers
 
-The code below downloads files the cloud, applies function
-`is_positive_dialogue_ending()` to each. All files with a positive sentiment
-are copied to local directory `output/`.
+The code below downloads files the cloud, and applies a user-defined function
+to each one of them. All files with a positive sentiment
+detected are then copied to the local directory.
 
 .. code:: py
 
@@ -169,7 +168,7 @@ are copied to local directory `output/`.
     )
 
     positive_chain = chain.filter(Column("is_positive") == True)
-    positive_chain.export_files("./output1")
+    positive_chain.export_files("./output")
 
     print(f"{positive_chain.count()} files were exported")
 
@@ -185,11 +184,11 @@
     13
 
 
-LLM judging LLMs dialogs
-==========================
+LLM judging chatbots
+=============================
 
-Finding good dialogs using an LLM can be more efficient. In this example,
-we use Mistral with a free API. Please install the package and get a free
+LLMs can work as efficient universal classifiers. In the example below,
+we employ a free API from Mistral to judge the chatbot performance. Please get a free
 Mistral API key at https://console.mistral.ai
 
 .. code:: shell
@@ -197,9 +196,7 @@ Mistral API key at https://console.mistral.ai
     $ pip install mistralai
    $ export MISTRAL_API_KEY=_your_key_
 
-Below is a similar code example, but this time using an LLM to evaluate the dialogs.
-Note, only 4 threads were used in this example `parallel=4` due to a limitation of
-the free LLM service.
+DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
 
 .. code:: py
 
@@ -231,7 +228,7 @@ the free LLM service.
     print(f"{successful_chain.count()} files were exported")
 
 
-With the current prompt, we found 31 files considered successful dialogs:
+With the instruction above, the Mistral model considers 31/50 files to hold the successful dialogues:
 
 .. code:: shell
 
@@ -245,11 +242,11 @@ With the current prompt, we found 31 files considered successful dialogs:
 Serializing Python-objects
 ==========================
 
-LLM responses contain valuable information for analytics, such as tokens used and the
-model. Preserving this information can be beneficial.
+LLM responses may contain valuable information for analytics such as the number of tokens used, or the
+model performance parameters.
 
-Instead of extracting this information from the Mistral data structure (class
-`ChatCompletionResponse`), we serialize the entire Python object to the internal DB.
+Instead of extracting this information from the Mistral response data structure (class
+`ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB:
 
 
 .. code:: py
@@ -297,21 +294,23 @@ Output:
     64.0% dialogs were successful
 
 
-Complex Python data structures
+Iterating over Python data structures
 =============================================
 
-In the previous examples, a few dataset were saved in the embedded database
-(`SQLite`_ in directory `.datachain`).
-These datasets are versioned, and can be accessed using
+In the previous examples, datasets were saved in the embedded database
+(`SQLite`_ in folder `.datachain` of the working directory).
+These datasets were automatically versioned, and can be accessed using
 `DataChain.from_dataset("dataset_name")`.
 
+Here is how to retrieve a saved dataset and iterate over the objects:
+
 .. code:: py
 
     chain = DataChain.from_dataset("response")
 
-    # Iterating one-by-one: out of memory
+    # Iterating one-by-one: support out-of-memory workflow
     for file, response in chain.limit(5).collect("file", "response"):
-        # You work with Python objects
+        # verify the collected Python objects
         assert isinstance(response, ChatCompletionResponse)
 
         status = response.choices[0].message.content[:7]
@@ -332,9 +331,8 @@ Output:
 Vectorized analytics over Python objects
 ========================================
 
-Some operations can be efficiently run inside the DB without deserializing Python objects.
-Let's calculate the cost of using LLM APIs in a vectorized way.
-Mistral calls cost $2 per 1M input tokens and $6 per 1M output tokens:
+Some operations can run inside the DB without deserialization.
+For instance, let's calculate the total cost of using the LLM APIs, assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M output tokens:
 
 .. code:: py
 
@@ -406,6 +404,7 @@ Community and Support
 .. github-only
 .. _Contributor Guide: CONTRIBUTING.rst
 .. _Pydantic: https://github.com/pydantic/pydantic
+.. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
 .. _SQLite: https://www.sqlite.org/
 .. _Getting Started: https://datachain.dvc.ai/
 .. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
datachain-0.2.13.dist-info/RECORD CHANGED
@@ -2,7 +2,7 @@ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
 datachain/cache.py,sha256=N6PCEFJlWRpq7f_zeBNoaURFCJFAV7ibsLJqyiMHbBg,4207
-datachain/cli.py,sha256=MSOID2t-kesk5Z80uoepN63rqvB7iZxaWYLqkiWehkQ,32628
+datachain/cli.py,sha256=Twb6BXjNxfAAGj42dUOJ7Ah5etkrTDVfMzAmINWUSOI,33104
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
@@ -17,7 +17,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=kgH5NPj47eC_KrFTd6ZS206lKVhnJVFt5XsqkK6ppTc,12483
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=u8tvWooIon9ju59q8-Re_iqflgbCB-JMZD8n2UC4iag,80397
+datachain/catalog/catalog.py,sha256=ab-PLPa9CMeHCo9asHjkqw4mZ6tHM4x8bsswfMtr65w,80575
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -43,7 +43,7 @@ datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
 datachain/lib/data_model.py,sha256=jPYDmTYbixy4LhdToOyvldYGYZxblhp6Tn4MF-VAd-o,1495
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
 datachain/lib/dc.py,sha256=KboCSSyjZ69hIpyjgza4HindFwO7L1Usxa0769N57NA,50561
-datachain/lib/file.py,sha256=xiLHaqyl4rqcBLGD62YD3aBIAOmX4EBVucxIncpRi80,11916
+datachain/lib/file.py,sha256=Y1QQ1pBSESjlA9n6_ukc3YtugeiTeF12xcncyfdCL2k,12128
 datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
 datachain/lib/meta_formats.py,sha256=Z2NVH5X4N2rrj5kFxKsHKq3zD4kaRHbDCx3oiUEKYUk,6920
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
@@ -66,7 +66,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=Bh8L4zA66XRhQxmONvLvn94_i8MBMYg
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
 datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
-datachain/query/dataset.py,sha256=m0bDQK_xXB85KPdJpH3OHdW6WJd1_PMgi01GRcWiiSg,61280
+datachain/query/dataset.py,sha256=jOMdvsQIMZq1hYPfqR_iKzGSlJ8m-7Wz75QxdFHdfwY,61567
 datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -92,9 +92,9 @@ datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,
 datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.2.12.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.2.12.dist-info/METADATA,sha256=QfDhY5jkblcb94A5CxT-ELhDcwDzZq1ju4cPQXHDEkY,14333
-datachain-0.2.12.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
-datachain-0.2.12.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.2.12.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.2.12.dist-info/RECORD,,
+datachain-0.2.13.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.2.13.dist-info/METADATA,sha256=jiEob-wl7pePOekp9tVY6h00czklAsktsmw910EvZbo,14619
+datachain-0.2.13.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
+datachain-0.2.13.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.2.13.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.2.13.dist-info/RECORD,,