datachain 0.2.12__py3-none-any.whl → 0.2.13__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

This version of datachain might be problematic.

datachain/catalog/catalog.py CHANGED
@@ -1,4 +1,5 @@
 import ast
+import glob
 import io
 import json
 import logging
@@ -709,7 +710,12 @@ class Catalog:
 
         client_config = client_config or self.client_config
         client, path = self.parse_url(source, **client_config)
-        prefix = posixpath.dirname(path)
+        stem = os.path.basename(os.path.normpath(path))
+        prefix = (
+            posixpath.dirname(path)
+            if glob.has_magic(stem) or client.fs.isfile(source)
+            else path
+        )
         storage_dataset_name = Storage.dataset_name(
             client.uri, posixpath.join(prefix, "")
         )
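The new prefix logic only trims the last path segment when it is a glob pattern or a single file; a plain directory path is now used as the listing prefix directly. A minimal standalone sketch of that branch, stdlib only (the `listing_prefix` helper is hypothetical, written here to mirror the diff):

    import glob
    import os
    import posixpath

    def listing_prefix(path: str, is_file: bool) -> str:
        # glob.has_magic() reports whether the last path segment
        # contains wildcard characters such as * or ?.
        stem = os.path.basename(os.path.normpath(path))
        return posixpath.dirname(path) if glob.has_magic(stem) or is_file else path

    # A glob pattern keeps only its parent directory as the prefix:
    assert listing_prefix("datasets/images/*.jpg", is_file=False) == "datasets/images"
    # A plain directory is now its own prefix (the 0.2.12 code would
    # have trimmed it to "datasets"):
    assert listing_prefix("datasets/images", is_file=False) == "datasets/images"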
datachain/cli.py CHANGED
@@ -491,6 +491,7 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
         type=int,
         help="Dataset version",
     )
+    show_parser.add_argument("--schema", action="store_true", help="Show schema")
     add_show_args(show_parser)
 
     query_parser = subp.add_parser(
@@ -816,10 +817,15 @@ def show(
     offset: int = 0,
     columns: Sequence[str] = (),
     no_collapse: bool = False,
+    schema: bool = False,
 ) -> None:
+    from datachain.lib.dc import DataChain
     from datachain.query import DatasetQuery
     from datachain.utils import show_records
 
+    dataset = catalog.get_dataset(name)
+    dataset_version = dataset.get_version(version or dataset.latest_version)
+
     query = (
         DatasetQuery(name=name, version=version, catalog=catalog)
         .select(*columns)
@@ -828,6 +834,10 @@
     )
     records = query.to_db_records()
     show_records(records, collapse_columns=not no_collapse)
+    if schema and dataset_version.feature_schema:
+        print("\nSchema:")
+        dc = DataChain(name=name, version=version, catalog=catalog)
+        dc.print_schema()
 
 
 def query(
@@ -1013,6 +1023,7 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
             offset=args.offset,
             columns=args.columns,
             no_collapse=args.no_collapse,
+            schema=args.schema,
         )
     elif args.command == "rm-dataset":
         rm_dataset(catalog, args.name, version=args.version, force=args.force)
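Taken together, these hunks wire a new `--schema` flag through `datachain show`, printing the dataset's feature schema after the records. A hedged sketch of the equivalent Python call (the dataset name `response` follows the README examples; assumes the default local catalog is used when none is passed):

    from datachain.lib.dc import DataChain

    # Same path the new CLI flag takes: load the saved dataset and
    # print its nested feature schema.
    dc = DataChain(name="response")
    dc.print_schema()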
datachain/lib/file.py CHANGED
@@ -20,7 +20,7 @@ from datachain.cache import UniqueId
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
 from datachain.lib.utils import DataChainError
-from datachain.sql.types import JSON, Int, String
+from datachain.sql.types import JSON, Boolean, DateTime, Int, String
 from datachain.utils import TIME_ZERO
 
 if TYPE_CHECKING:
@@ -126,11 +126,13 @@ class File(DataModel):
         "source": String,
         "parent": String,
         "name": String,
+        "size": Int,
         "version": String,
         "etag": String,
-        "size": Int,
-        "vtype": String,
+        "is_latest": Boolean,
+        "last_modified": DateTime,
         "location": JSON,
+        "vtype": String,
     }
 
     _unique_id_keys: ClassVar[list[str]] = [
@@ -214,7 +216,7 @@
         with self.open(mode="r") as stream:
             return stream.read()
 
-    def write(self, destination: str):
+    def save(self, destination: str):
         """Writes it's content to destination"""
         with open(destination, mode="wb") as f:
             f.write(self.read())
@@ -232,7 +234,7 @@
         dst_dir = os.path.dirname(dst)
         os.makedirs(dst_dir, exist_ok=True)
 
-        self.write(dst)
+        self.save(dst)
 
     def _set_stream(
         self,
@@ -330,7 +332,7 @@ class TextFile(File):
         with self.open() as stream:
             return stream.read()
 
-    def write(self, destination: str):
+    def save(self, destination: str):
         """Writes it's content to destination"""
         with open(destination, mode="w") as f:
             f.write(self.read_text())
@@ -344,7 +346,7 @@ class ImageFile(File):
         fobj = super().read()
         return Image.open(BytesIO(fobj))
 
-    def write(self, destination: str):
+    def save(self, destination: str):
         """Writes it's content to destination"""
         self.read().save(destination)
 
@@ -360,21 +362,25 @@ def get_file(type_: Literal["binary", "text", "image"] = "binary"):
         source: str,
         parent: str,
         name: str,
+        size: int,
         version: str,
         etag: str,
-        size: int,
-        vtype: str,
+        is_latest: bool,
+        last_modified: datetime,
         location: Optional[Union[dict, list[dict]]],
+        vtype: str,
     ) -> file:  # type: ignore[valid-type]
         return file(
            source=source,
            parent=parent,
            name=name,
+           size=size,
            version=version,
            etag=etag,
-           size=size,
-           vtype=vtype,
+           is_latest=is_latest,
+           last_modified=last_modified,
            location=location,
+           vtype=vtype,
        )
 
     return get_file_type
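Note that `write()` on `File` and its subclasses is renamed to `save()` in this release. A hedged usage sketch under 0.2.x API assumptions (`data/` is a hypothetical local directory; `from_storage` and single-signal `collect` are assumed to behave as in the README examples):

    from datachain.lib.dc import DataChain

    chain = DataChain.from_storage("data/")
    for file in chain.limit(1).collect("file"):
        # 0.2.13 spelling; in 0.2.12 this was file.write("copy.bin")
        file.save("copy.bin")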
datachain/query/dataset.py CHANGED
@@ -820,8 +820,16 @@ class SQLMutate(SQLClause):
     args: tuple[ColumnElement, ...]
 
     def apply_sql_clause(self, query: Select) -> Select:
-        subquery = query.subquery()
-        return sqlalchemy.select(*subquery.c, *self.args).select_from(subquery)
+        original_subquery = query.subquery()
+        # this is needed for new column to be used in clauses
+        # like ORDER BY, otherwise new column is not recognized
+        subquery = (
+            sqlalchemy.select(*original_subquery.c, *self.args)
+            .select_from(original_subquery)
+            .subquery()
+        )
+
+        return sqlalchemy.select(*subquery.c).select_from(subquery)
 
 
 @frozen
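The mutate fix is easiest to see in isolation: wrapping the mutated SELECT in a second subquery turns the computed expression into an ordinary named column that later clauses such as ORDER BY can reference. A minimal self-contained SQLAlchemy sketch (table and column names are hypothetical):

    import sqlalchemy as sa

    items = sa.table("items", sa.column("size"))
    base = sa.select(items.c.size).subquery()

    # Select the computed column from a wrapping subquery, as the patch
    # does, so "doubled" becomes a real column of the outer SELECT.
    wrapped = sa.select(*base.c, (base.c.size * 2).label("doubled")).subquery()
    stmt = sa.select(*wrapped.c).order_by(wrapped.c.doubled)

    print(stmt)  # the rendered SQL orders by the named column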
datachain-0.2.13.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.2.12
+Version: 0.2.13
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -45,6 +45,7 @@ Provides-Extra: dev
 Requires-Dist: datachain[docs,tests] ; extra == 'dev'
 Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
 Requires-Dist: types-python-dateutil ; extra == 'dev'
+Requires-Dist: types-pytz ; extra == 'dev'
 Requires-Dist: types-PyYAML ; extra == 'dev'
 Requires-Dist: types-requests ; extra == 'dev'
 Requires-Dist: types-ujson ; extra == 'dev'
@@ -103,20 +104,18 @@ AI 🔗 DataChain
 DataChain is an open-source Python library for processing and curating unstructured
 data at scale.
 
-🤖 AI-Driven Data Curation: Use local ML models, LLM APIs calls to enrich your data.
+🤖 AI-Driven Data Curation: Use local ML models or LLM APIs calls to enrich your data.
 
-🚀 GenAI Dataset scale: Handle 10s of milions of files or file snippets.
+🚀 GenAI Dataset scale: Handle tens of millions of multimodal files.
 
-🐍 Python-friendly: Use strictly typed `Pydantic`_ objects instead of JSON.
+🐍 Python-friendly: Use strictly-typed `Pydantic`_ objects instead of JSON.
 
 
-To ensure efficiency, Datachain supports parallel processing, parallel data
-downloads, and out-of-memory computing. It excels at optimizing batch operations.
-While most GenAI tools focus on online applications and realtime, DataChain is designed
-for offline data processing, data curation and ETL.
+Datachain supports parallel processing, parallel data
+downloads, and out-of-memory computing. It excels at optimizing offline batch operations.
 
-The typical use cases are Computer Vision data curation, LLM analytics
-and validation.
+The typical use cases include Computer Vision data curation, LLM analytics,
+and validation of multimodal AI applications.
 
 
 .. code:: console
@@ -128,25 +127,25 @@ and validation.
 Quick Start
 -----------
 
-Basic evaluation
-================
+Data curation with a local model
+=================================
 
 We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
-- 50 files total in the example.
-These dialogs involve users looking for better wireless plans chatting with bot.
-Our goal is to identify successful dialogs.
+- 50 files total in this example.
+These dialogs involve users chatting with a bot while looking for better wireless plans.
+Our goal is to identify the successful dialogs.
 
-The data used in the examples is publicly available. Please feel free to run this code.
+The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.
 
-First, we'll use a simple sentiment analysis model. Please install transformers.
+First, we'll show batch inference with a simple sentiment model using the `transformers` library:
 
 .. code:: shell
 
     pip install transformers
 
-The code below downloads files the cloud, applies function
-`is_positive_dialogue_ending()` to each. All files with a positive sentiment
-are copied to local directory `output/`.
+The code below downloads files the cloud, and applies a user-defined function
+to each one of them. All files with a positive sentiment
+detected are then copied to the local directory.
 
 .. code:: py
 
@@ -169,7 +168,7 @@ are copied to local directory `output/`.
     )
 
     positive_chain = chain.filter(Column("is_positive") == True)
-    positive_chain.export_files("./output1")
+    positive_chain.export_files("./output")
 
     print(f"{positive_chain.count()} files were exported")
 
@@ -185,11 +184,11 @@
     13
 
 
-LLM judging LLMs dialogs
-==========================
+LLM judging chatbots
+=============================
 
-Finding good dialogs using an LLM can be more efficient. In this example,
-we use Mistral with a free API. Please install the package and get a free
+LLMs can work as efficient universal classifiers. In the example below,
+we employ a free API from Mistral to judge the chatbot performance. Please get a free
 Mistral API key at https://console.mistral.ai
 
 .. code:: shell
@@ -197,9 +196,7 @@ Mistral API key at https://console.mistral.ai
     $ pip install mistralai
    $ export MISTRAL_API_KEY=_your_key_
 
-Below is a similar code example, but this time using an LLM to evaluate the dialogs.
-Note, only 4 threads were used in this example `parallel=4` due to a limitation of
-the free LLM service.
+DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
 
 .. code:: py
 
@@ -231,7 +228,7 @@ the free LLM service.
     print(f"{successful_chain.count()} files were exported")
 
 
-With the current prompt, we found 31 files considered successful dialogs:
+With the instruction above, the Mistral model considers 31/50 files to hold the successful dialogues:
 
 .. code:: shell
 
@@ -245,11 +242,11 @@ With the current prompt, we found 31 files considered successful dialogs:
 Serializing Python-objects
 ==========================
 
-LLM responses contain valuable information for analytics, such as tokens used and the
-model. Preserving this information can be beneficial.
+LLM responses may contain valuable information for analytics such as the number of tokens used, or the
+model performance parameters.
 
-Instead of extracting this information from the Mistral data structure (class
-`ChatCompletionResponse`), we serialize the entire Python object to the internal DB.
+Instead of extracting this information from the Mistral response data structure (class
+`ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB:
 
 
 .. code:: py
@@ -297,21 +294,23 @@ Output:
     64.0% dialogs were successful
 
 
-Complex Python data structures
+Iterating over Python data structures
 =============================================
 
-In the previous examples, a few dataset were saved in the embedded database
-(`SQLite`_ in directory `.datachain`).
-These datasets are versioned, and can be accessed using
+In the previous examples, datasets were saved in the embedded database
+(`SQLite`_ in folder `.datachain` of the working directory).
+These datasets were automatically versioned, and can be accessed using
 `DataChain.from_dataset("dataset_name")`.
 
+Here is how to retrieve a saved dataset and iterate over the objects:
+
 .. code:: py
 
     chain = DataChain.from_dataset("response")
 
-    # Iterating one-by-one: out of memory
+    # Iterating one-by-one: support out-of-memory workflow
     for file, response in chain.limit(5).collect("file", "response"):
-        # You work with Python objects
+        # verify the collected Python objects
         assert isinstance(response, ChatCompletionResponse)
 
         status = response.choices[0].message.content[:7]
@@ -332,9 +331,8 @@ Output:
 Vectorized analytics over Python objects
 ========================================
 
-Some operations can be efficiently run inside the DB without deserializing Python objects.
-Let's calculate the cost of using LLM APIs in a vectorized way.
-Mistral calls cost $2 per 1M input tokens and $6 per 1M output tokens:
+Some operations can run inside the DB without deserialization.
+For instance, let's calculate the total cost of using the LLM APIs, assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M output tokens:
 
 .. code:: py
 
@@ -406,6 +404,7 @@ Community and Support
 .. github-only
 .. _Contributor Guide: CONTRIBUTING.rst
 .. _Pydantic: https://github.com/pydantic/pydantic
+.. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
 .. _SQLite: https://www.sqlite.org/
 .. _Getting Started: https://datachain.dvc.ai/
 .. |Flowchart| image:: https://github.com/iterative/datachain/blob/main/docs/assets/flowchart.png?raw=true
datachain-0.2.13.dist-info/RECORD CHANGED
@@ -2,7 +2,7 @@ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
 datachain/cache.py,sha256=N6PCEFJlWRpq7f_zeBNoaURFCJFAV7ibsLJqyiMHbBg,4207
-datachain/cli.py,sha256=MSOID2t-kesk5Z80uoepN63rqvB7iZxaWYLqkiWehkQ,32628
+datachain/cli.py,sha256=Twb6BXjNxfAAGj42dUOJ7Ah5etkrTDVfMzAmINWUSOI,33104
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
@@ -17,7 +17,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=kgH5NPj47eC_KrFTd6ZS206lKVhnJVFt5XsqkK6ppTc,12483
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=u8tvWooIon9ju59q8-Re_iqflgbCB-JMZD8n2UC4iag,80397
+datachain/catalog/catalog.py,sha256=ab-PLPa9CMeHCo9asHjkqw4mZ6tHM4x8bsswfMtr65w,80575
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -43,7 +43,7 @@ datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
 datachain/lib/data_model.py,sha256=jPYDmTYbixy4LhdToOyvldYGYZxblhp6Tn4MF-VAd-o,1495
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
 datachain/lib/dc.py,sha256=KboCSSyjZ69hIpyjgza4HindFwO7L1Usxa0769N57NA,50561
-datachain/lib/file.py,sha256=xiLHaqyl4rqcBLGD62YD3aBIAOmX4EBVucxIncpRi80,11916
+datachain/lib/file.py,sha256=Y1QQ1pBSESjlA9n6_ukc3YtugeiTeF12xcncyfdCL2k,12128
 datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
 datachain/lib/meta_formats.py,sha256=Z2NVH5X4N2rrj5kFxKsHKq3zD4kaRHbDCx3oiUEKYUk,6920
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
@@ -66,7 +66,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=Bh8L4zA66XRhQxmONvLvn94_i8MBMYg
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
 datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
-datachain/query/dataset.py,sha256=m0bDQK_xXB85KPdJpH3OHdW6WJd1_PMgi01GRcWiiSg,61280
+datachain/query/dataset.py,sha256=jOMdvsQIMZq1hYPfqR_iKzGSlJ8m-7Wz75QxdFHdfwY,61567
 datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -92,9 +92,9 @@ datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,
 datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.2.12.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.2.12.dist-info/METADATA,sha256=QfDhY5jkblcb94A5CxT-ELhDcwDzZq1ju4cPQXHDEkY,14333
-datachain-0.2.12.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
-datachain-0.2.12.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.2.12.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.2.12.dist-info/RECORD,,
+datachain-0.2.13.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.2.13.dist-info/METADATA,sha256=jiEob-wl7pePOekp9tVY6h00czklAsktsmw910EvZbo,14619
+datachain-0.2.13.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
+datachain-0.2.13.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.2.13.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.2.13.dist-info/RECORD,,