datamaestro-text 2026.1.1__tar.gz → 2026.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/PKG-INFO +1 -7
  2. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/pyproject.toml +28 -10
  3. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/__init__.py +1 -1
  4. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/com/github/ikat.py +0 -1
  5. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/com/oscar-corpus.py +1 -1
  6. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/edu/stanford/aclimdb.py +1 -1
  7. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/edu/stanford/glove.py +0 -1
  8. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/gov/nist/ir/covid.py +1 -2
  9. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/io/metamind/research/wikitext.py +1 -1
  10. datamaestro_text-2026.2.2/src/datamaestro_text/data/conversation/__init__.py +8 -0
  11. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/conversation/base.py +2 -2
  12. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/conversation/canard.py +3 -4
  13. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/conversation/ikat.py +0 -1
  14. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/conversation/orconvqa.py +3 -3
  15. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/embeddings.py +1 -0
  16. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/ir/__init__.py +1 -1
  17. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/ir/base.py +1 -1
  18. datamaestro_text-2026.2.2/src/datamaestro_text/data/ir/data.py +1 -0
  19. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/ir/formats.py +2 -1
  20. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/ir/stores.py +1 -1
  21. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/text.py +1 -0
  22. datamaestro_text-2026.2.2/src/datamaestro_text/datasets/__init__.py +1 -0
  23. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/datasets/irds/data.py +1 -6
  24. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/download/tmdb.py +0 -1
  25. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/transforms/ir/__init__.py +12 -13
  26. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/utils/shuffle.py +1 -1
  27. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/version.py +2 -2
  28. datamaestro_text-2026.1.1/src/datamaestro_text/data/conversation/__init__.py +0 -8
  29. datamaestro_text-2026.1.1/src/datamaestro_text/data/ir/data.py +0 -1
  30. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/.gitignore +0 -0
  31. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/LICENSE +0 -0
  32. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/README.md +0 -0
  33. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/__init__.py +0 -0
  34. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/ai/quac.yaml +0 -0
  35. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
  36. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -0
  37. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +0 -0
  38. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -0
  39. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
  40. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
  41. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
  42. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/com/sentiment140.py +0 -0
  43. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
  44. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
  45. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
  46. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
  47. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
  48. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
  49. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
  50. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/fr/granddebat.py +0 -0
  51. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/gov/__init__.py +0 -0
  52. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
  53. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
  54. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
  55. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
  56. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
  57. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
  58. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -0
  59. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
  60. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
  61. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
  62. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
  63. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
  64. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
  65. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
  66. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
  67. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
  68. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
  69. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/__init__.py +0 -0
  70. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/conversation/qrecc.py +0 -0
  71. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/debate/__init__.py +0 -0
  72. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/debate/granddebat.py +0 -0
  73. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/ir/cord19.py +0 -0
  74. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/ir/csv.py +0 -0
  75. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/ir/huggingface.py +0 -0
  76. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/ir/trec.py +0 -0
  77. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/ir/utils.py +0 -0
  78. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/recommendation.py +0 -0
  79. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/data/tagging.py +0 -0
  80. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
  81. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
  82. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/datasets/irds/helpers.py +0 -0
  83. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/datasets/irds/utils.py +0 -0
  84. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/interfaces/plaintext.py +0 -0
  85. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/interfaces/trec.py +0 -0
  86. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/test/__init__.py +0 -0
  87. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/test/test_datasets.py +0 -0
  88. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/test/test_documented.py +0 -0
  89. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/transforms/__init__.py +0 -0
  90. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/utils/__init__.py +0 -0
  91. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/utils/files.py +0 -0
  92. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/utils/iter.py +0 -0
  93. {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.2}/src/datamaestro_text/utils/randomstream.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro-text
3
- Version: 2026.1.1
3
+ Version: 2026.2.2
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Project-URL: Homepage, https://github.com/experimaestro/datamaestro_text
6
6
  Project-URL: Documentation, https://datamaestro-text.readthedocs.io/en/latest/
@@ -25,12 +25,6 @@ Requires-Dist: attrs
25
25
  Requires-Dist: datamaestro>=1.6.2
26
26
  Requires-Dist: experimaestro
27
27
  Requires-Dist: ir-datasets>=0.5.8
28
- Provides-Extra: dev
29
- Requires-Dist: docutils; extra == 'dev'
30
- Requires-Dist: flake8; extra == 'dev'
31
- Requires-Dist: pytest; extra == 'dev'
32
- Requires-Dist: sphinx<8; extra == 'dev'
33
- Requires-Dist: sphobjinv; extra == 'dev'
34
28
  Description-Content-Type: text/markdown
35
29
 
36
30
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)
@@ -32,15 +32,6 @@ dependencies = [
32
32
  "experimaestro",
33
33
  ]
34
34
 
35
- [project.optional-dependencies]
36
- dev = [
37
- "pytest",
38
- "docutils",
39
- "sphobjinv",
40
- "flake8",
41
- "sphinx<8",
42
- ]
43
-
44
35
  [project.urls]
45
36
  Homepage = "https://github.com/experimaestro/datamaestro_text"
46
37
  Documentation = "https://datamaestro-text.readthedocs.io/en/latest/"
@@ -79,9 +70,36 @@ norecursedirs = ["node_modules"]
79
70
  [dependency-groups]
80
71
  dev = [
81
72
  "docutils>=0.21.2",
82
- "flake8>=7.3.0",
83
73
  "git-cliff>=2.11.0",
84
74
  "pytest>=8.4.1",
75
+ "ruff>=0.8",
85
76
  "sphinx>=7,<8",
86
77
  "sphobjinv>=2.3.1.3",
87
78
  ]
79
+ docs = [
80
+ "datamaestro>=0.8.5",
81
+ "experimaestro>=2.0.0b29", # Pre-release needed for EnumType.name() fix
82
+ "myst-parser>=0.18.0",
83
+ "sphinx>=6,<8", # experimaestro.sphinx uses restify removed in Sphinx 8
84
+ "sphinx-codeautolink>=0.15.0",
85
+ "sphinx-rtd-theme>=3.1.0",
86
+ "sphinx-toolbox>=4.1.2",
87
+ ]
88
+
89
+ [tool.ruff]
90
+ line-length = 88
91
+ target-version = "py310"
92
+
93
+ [tool.ruff.lint]
94
+ select = [
95
+ "E", # pycodestyle errors
96
+ "F", # pyflakes
97
+ "W", # pycodestyle warnings
98
+ "C90", # mccabe complexity
99
+ ]
100
+ ignore = [
101
+ "E501", # line too long (handled by formatter)
102
+ ]
103
+
104
+ [tool.ruff.lint.mccabe]
105
+ max-complexity = 20
@@ -1,6 +1,6 @@
1
1
  import datamaestro
2
2
 
3
- from .version import version, version_tuple
3
+ from .version import version as version, version_tuple as version_tuple
4
4
 
5
5
 
6
6
  class Repository(datamaestro.Repository):
@@ -1,6 +1,5 @@
1
1
  # See documentation on https://datamaestro.readthedocs.io
2
2
 
3
- import bz2
4
3
  from datamaestro.download import reference
5
4
  from datamaestro.definitions import datatasks, datatags, dataset
6
5
  from datamaestro_text.data.conversation.base import ConversationUserTopics
@@ -1,4 +1,4 @@
1
- from datamaestro.definitions import datatasks, datatags, dataset
1
+ from datamaestro.definitions import dataset
2
2
  from datamaestro.download.single import filedownloader
3
3
  from datamaestro_text.data.text import TextFile
4
4
  from datamaestro.utils import HashCheck
@@ -1,5 +1,5 @@
1
1
  from datamaestro.data.ml import FolderBased, Supervised
2
- from datamaestro.definitions import datatasks, datatags, dataset
2
+ from datamaestro.definitions import dataset
3
3
  from datamaestro.download.archive import tardownloader
4
4
 
5
5
 
@@ -5,7 +5,6 @@ GloVe is an unsupervised learning algorithm for obtaining vector representations
5
5
  """
6
6
 
7
7
  from datamaestro.definitions import dataset
8
- from datamaestro.data import Base, Generic
9
8
  from datamaestro.download import reference
10
9
  from datamaestro.download.archive import zipdownloader
11
10
  from datamaestro.download.single import filedownloader
@@ -1,5 +1,4 @@
1
- """CORD-19 dataset
2
- """
1
+ """CORD-19 dataset"""
3
2
 
4
3
  from datamaestro.annotations.agreement import useragreement
5
4
  from datamaestro.definitions import datatasks, dataset
@@ -1,4 +1,4 @@
1
- from datamaestro.data import Base, File
1
+ from datamaestro.data import File
2
2
  from datamaestro.definitions import (
3
3
  datatasks,
4
4
  datatags,
@@ -0,0 +1,8 @@
1
+ from .base import (
2
+ AnswerEntry as AnswerEntry,
3
+ ConversationDataset as ConversationDataset,
4
+ ConversationHistory as ConversationHistory,
5
+ ConversationHistoryItem as ConversationHistoryItem,
6
+ DecontextualizedItem as DecontextualizedItem,
7
+ EntryType as EntryType,
8
+ )
@@ -2,13 +2,13 @@ from abc import ABC, abstractmethod
2
2
  from enum import Enum
3
3
  from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
4
4
  from experimaestro import Param
5
- from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
5
+ from typing import Dict, Iterator, List, Optional, Sequence, Tuple
6
6
  from attr import define
7
7
  from datamaestro.record import record_type
8
8
  from datamaestro.data import Base
9
9
  from datamaestro.record import Record, Item
10
10
  from datamaestro_text.data.ir import TopicRecord, Topics
11
- from datamaestro_text.utils.iter import FactoryIterable, LazyList, RangeView
11
+ from datamaestro_text.utils.iter import FactoryIterable, LazyList
12
12
 
13
13
  # ---- Basic types
14
14
 
@@ -11,7 +11,6 @@ from datamaestro_text.data.conversation.base import (
11
11
  EntryType,
12
12
  )
13
13
  from datamaestro_text.data.ir import IDItem, SimpleTextItem
14
- import logging
15
14
 
16
15
 
17
16
  @define(kw_only=True)
@@ -82,9 +81,9 @@ class CanardDataset(ConversationDataset, File):
82
81
  )
83
82
  else:
84
83
  # The utterance before the last is the last user query
85
- assert (
86
- entry.history[-2] == history[-1][SimpleTextItem].text
87
- ), f"{entry.dialogue_id} {entry.history} / {history[-4:-1]}"
84
+ assert entry.history[-2] == history[-1][SimpleTextItem].text, (
85
+ f"{entry.dialogue_id} {entry.history} / {history[-4:-1]}"
86
+ )
88
87
 
89
88
  # The last utterance is the system side
90
89
  history.append(
@@ -5,7 +5,6 @@ import logging
5
5
  from datamaestro.data import File
6
6
  from datamaestro.record import Record
7
7
 
8
- from datamaestro_text.data.ir import Topics
9
8
  from datamaestro_text.data.ir.base import (
10
9
  IDItem,
11
10
  SimpleTextItem,
@@ -113,9 +113,9 @@ class OrConvQADataset(ConversationDataset, File):
113
113
  if relevance > 0:
114
114
  relevances[rank] = (entry.answer.answer_start, None)
115
115
 
116
- assert (
117
- len(relevances) <= 1
118
- ), f"Too many relevance labels ({len(relevances)}) for {entry.query_id}"
116
+ assert len(relevances) <= 1, (
117
+ f"Too many relevance labels ({len(relevances)}) for {entry.query_id}"
118
+ )
119
119
 
120
120
  history.append(
121
121
  Record(
@@ -20,6 +20,7 @@ class WordEmbeddings(Base):
20
20
 
21
21
  class WordEmbeddingsText(WordEmbeddings, File):
22
22
  """Word embeddings as a text word / values"""
23
+
23
24
  encoding: Meta[str] = "utf-8"
24
25
 
25
26
  def load(self):
@@ -6,7 +6,7 @@ from functools import cached_property
6
6
  import logging
7
7
  from pathlib import Path
8
8
  from attrs import define
9
- from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type, TYPE_CHECKING
9
+ from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type
10
10
  import random
11
11
  from experimaestro import Config
12
12
  from datamaestro.definitions import datatasks, Param, Meta
@@ -1,7 +1,7 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from attrs import define
3
3
  from typing import List
4
- from datamaestro.record import Record, Item, record_type
4
+ from datamaestro.record import Record, Item
5
5
 
6
6
 
7
7
  TopicRecord = DocumentRecord = Record
@@ -0,0 +1 @@
1
+ from .base import * # noqa: F403
@@ -1,5 +1,5 @@
1
1
  from functools import cached_property
2
- from typing import ClassVar, Tuple, List
2
+ from typing import Tuple, List
3
3
  from attrs import define
4
4
  from datamaestro.record import record_type
5
5
  from ir_datasets.datasets.wapo import WapoDocMedia
@@ -10,6 +10,7 @@ from ir_datasets.datasets.cord19 import Cord19FullTextSection
10
10
  @define
11
11
  class DocumentWithTitle(TextItem):
12
12
  """Web document with title and body"""
13
+
13
14
  title: str
14
15
  body: str
15
16
 
@@ -82,7 +82,7 @@ class IKatClueWeb22DocumentStore(LZ4DocumentStore):
82
82
 
83
83
  file_checksum = hasher.hexdigest()
84
84
  assert file_checksum == checksum, (
85
- f"Expected {checksum}, " f"got {file_checksum} for {filename}"
85
+ f"Expected {checksum}, got {file_checksum} for {filename}"
86
86
  )
87
87
 
88
88
  # Get the MD5 hashes of all the passages
@@ -14,6 +14,7 @@ class TrainingText(Supervised):
14
14
 
15
15
  class TextFolder(Folder):
16
16
  "A folder composed of texts"
17
+
17
18
  pass
18
19
 
19
20
 
@@ -0,0 +1 @@
1
+ # IR datasets integration package
@@ -1,6 +1,5 @@
1
1
  import logging
2
2
  from abc import ABC, abstractmethod
3
- from dataclasses import dataclass
4
3
  from functools import cached_property, partial
5
4
  from pathlib import Path
6
5
  from typing import Dict, Iterator, List, NamedTuple, Tuple, Type
@@ -613,11 +612,7 @@ class Cast2022TopicsHandler(CastTopicsHandler):
613
612
  records = []
614
613
  nodes: Dict[str, ConversationTreeNode] = {}
615
614
 
616
- for (
617
- query
618
- ) in (
619
- self.dataset.dataset.queries_iter()
620
- ): # type: _irds.trec_cast.Cast2022Query
615
+ for query in self.dataset.dataset.queries_iter(): # type: _irds.trec_cast.Cast2022Query
621
616
  parent = nodes[query.parent_id] if query.parent_id else None
622
617
 
623
618
  if query.participant == "User":
@@ -10,7 +10,6 @@ from collections import namedtuple
10
10
 
11
11
  from datamaestro.download import Download
12
12
  from datamaestro.definitions import AbstractDataset
13
- from datamaestro.utils import TemporaryDirectory
14
13
 
15
14
  APIKEY_KEY = "org.themoviedb.apikey"
16
15
 
@@ -2,7 +2,6 @@ import logging
2
2
  import gzip
3
3
  from abc import ABC, abstractmethod
4
4
  from pathlib import Path
5
- from typing import Type
6
5
  from experimaestro import Config, Task, Param, Annotated, pathgenerator, Option, tqdm
7
6
  import numpy as np
8
7
  from datamaestro.record import RecordType
@@ -131,22 +130,22 @@ class ShuffledTrainingTripletsLines(Task):
131
130
 
132
131
  def __validate__(self):
133
132
  if self.topic_ids:
134
- assert self.data.topic_recordtype.has(
135
- ir.IDItem
136
- ), f"No topic ID in the source data ({self.data.topic_recordtype})"
133
+ assert self.data.topic_recordtype.has(ir.IDItem), (
134
+ f"No topic ID in the source data ({self.data.topic_recordtype})"
135
+ )
137
136
  else:
138
- assert self.data.topic_recordtype.has(
139
- ir.TextItem
140
- ), f"No topic text in the source data ({self.data.topic_recordtype})"
137
+ assert self.data.topic_recordtype.has(ir.TextItem), (
138
+ f"No topic text in the source data ({self.data.topic_recordtype})"
139
+ )
141
140
 
142
141
  if self.doc_ids:
143
- assert self.data.document_recordtype.has(
144
- ir.IDItem
145
- ), "No doc ID in the source data"
142
+ assert self.data.document_recordtype.has(ir.IDItem), (
143
+ "No doc ID in the source data"
144
+ )
146
145
  else:
147
- assert self.data.document_recordtype.has(
148
- ir.TextItem
149
- ), "No doc text in the source data"
146
+ assert self.data.document_recordtype.has(ir.TextItem), (
147
+ "No doc text in the source data"
148
+ )
150
149
 
151
150
  def task_outputs(self, dep):
152
151
  return dep(
@@ -50,7 +50,7 @@ def shuffle(
50
50
  *,
51
51
  memory=MEMORY,
52
52
  random=None,
53
- tmp_path: Optional[Path] = None
53
+ tmp_path: Optional[Path] = None,
54
54
  ):
55
55
  """Shuffle using temporary file"""
56
56
  if random is None:
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '2026.1.1'
32
- __version_tuple__ = version_tuple = (2026, 1, 1)
31
+ __version__ = version = '2026.2.2'
32
+ __version_tuple__ = version_tuple = (2026, 2, 2)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -1,8 +0,0 @@
1
- from .base import (
2
- AnswerEntry,
3
- ConversationDataset,
4
- ConversationHistory,
5
- ConversationHistoryItem,
6
- DecontextualizedItem,
7
- EntryType,
8
- )
@@ -1 +0,0 @@
1
- from .base import *