datamaestro-text 2025.7.28__tar.gz → 2025.9.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamaestro_text-2025.7.28/src/datamaestro_text.egg-info → datamaestro_text-2025.9.11}/PKG-INFO +1 -1
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/github/aagohary/canard.py +3 -3
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +2 -1
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +1 -1
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/sentiment140.py +2 -2
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/stanford/aclimdb.py +2 -2
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +1 -1
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/io/metamind/research/wikitext.py +3 -3
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/org/grouplens/movielens.py +8 -8
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/org/universaldependencies/french.py +3 -3
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/__init__.py +19 -2
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/datasets/irds/data.py +13 -1
- datamaestro_text-2025.9.11/src/datamaestro_text/version.py +34 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11/src/datamaestro_text.egg-info}/PKG-INFO +1 -1
- datamaestro_text-2025.7.28/src/datamaestro_text/version.py +0 -21
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/.circleci/config.yml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/.flake8 +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/.github/workflows/pytest.yml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/.github/workflows/python-publish.yml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/.gitignore +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/.pre-commit-config.yaml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/.readthedocs.yml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/LICENSE +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/MANIFEST.in +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/Makefile +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/README.md +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/Makefile +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/make.bat +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/requirements.txt +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/api/conversation.rst +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/api/embeddings.rst +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/api/index.rst +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/api/ir.rst +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/api/nlp.rst +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/api/recommendation.rst +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/api/text.rst +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/conf.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/datasets/conversation.rst +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/datasets/embeddings.rst +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/datasets/index.rst +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/datasets/ir.rst +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/datasets/irds.rst +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/datasets/recommendation.rst +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/datasets/text.rst +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/index.rst +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/mkdocs.yml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/pyproject.toml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/requirements-dev.txt +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/requirements.txt +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/setup.cfg +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/__init__.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/__init__.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/ai/quac.yaml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/github/ikat.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/__init__.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/__init__.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/__init__.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/base.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/canard.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/ikat.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/orconvqa.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/qrecc.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/embeddings.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/base.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/cord19.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/csv.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/data.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/formats.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/huggingface.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/stores.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/trec.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/utils.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/recommendation.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/tagging.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/text.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/datasets/irds/helpers.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/datasets/irds/utils.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/download/tmdb.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/interfaces/plaintext.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/interfaces/trec.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/test/__init__.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/test/test_datasets.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/test/test_documented.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/transforms/__init__.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/__init__.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/files.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/iter.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/randomstream.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/shuffle.py +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text.egg-info/SOURCES.txt +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text.egg-info/requires.txt +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text.egg-info/top_level.txt +0 -0
- {datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/tox.ini +0 -0
|
@@ -37,7 +37,7 @@ def main(train, dev, test):
|
|
|
37
37
|
Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
|
|
38
38
|
"""
|
|
39
39
|
return {
|
|
40
|
-
"train": CanardDataset(path=train),
|
|
41
|
-
"validation": CanardDataset(path=dev),
|
|
42
|
-
"test": CanardDataset(path=test),
|
|
40
|
+
"train": CanardDataset.C(path=train),
|
|
41
|
+
"validation": CanardDataset.C(path=dev),
|
|
42
|
+
"test": CanardDataset.C(path=test),
|
|
43
43
|
}
|
|
@@ -51,6 +51,7 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
|
|
|
51
51
|
)
|
|
52
52
|
class Content(LZ4JSONLDocumentStore):
|
|
53
53
|
"""QReCC mentioned URLs content"""
|
|
54
|
+
|
|
54
55
|
@staticmethod
|
|
55
56
|
def __create_dataset__(dataset, options=None):
|
|
56
57
|
ds = reference(reference=main).setup(dataset, options)
|
|
@@ -65,7 +66,7 @@ class Content(LZ4JSONLDocumentStore):
|
|
|
65
66
|
"id",
|
|
66
67
|
).setup(dataset, options)
|
|
67
68
|
|
|
68
|
-
return Content(jsonl_path=store_path)
|
|
69
|
+
return Content.C(jsonl_path=store_path)
|
|
69
70
|
|
|
70
71
|
@staticmethod
|
|
71
72
|
def _documents(path: Path):
|
|
@@ -47,7 +47,7 @@ http://www.msmarco.org/dataset.aspx""",
|
|
|
47
47
|
@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
|
|
48
48
|
def collection_etc(data) -> Folder:
|
|
49
49
|
"""Documents and some more files"""
|
|
50
|
-
return Folder(path=data)
|
|
50
|
+
return Folder.C(path=data)
|
|
51
51
|
|
|
52
52
|
|
|
53
53
|
@lua
|
|
@@ -27,6 +27,6 @@ def english(dir):
|
|
|
27
27
|
If you use this data, please cite Sentiment140 as your source.
|
|
28
28
|
"""
|
|
29
29
|
return Supervised.C(
|
|
30
|
-
train=Generic(path=dir / "training.1600000.processed.noemoticon.csv"),
|
|
31
|
-
test=Generic(path=dir / "testdata.manual.2009.06.14.csv"),
|
|
30
|
+
train=Generic.C(path=dir / "training.1600000.processed.noemoticon.csv"),
|
|
31
|
+
test=Generic.C(path=dir / "testdata.manual.2009.06.14.csv"),
|
|
32
32
|
)
|
|
@@ -11,6 +11,6 @@ def aclimdb(data):
|
|
|
11
11
|
Paper http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf
|
|
12
12
|
"""
|
|
13
13
|
return {
|
|
14
|
-
"train": FolderBased(path=data / "train", classes=["neg", "pos"]),
|
|
15
|
-
"test": FolderBased(path=data / "test", classes=["neg", "pos"]),
|
|
14
|
+
"train": FolderBased.C(path=data / "train", classes=["neg", "pos"]),
|
|
15
|
+
"test": FolderBased.C(path=data / "test", classes=["neg", "pos"]),
|
|
16
16
|
}
|
|
@@ -32,4 +32,4 @@ def v1(train, validation):
|
|
|
32
32
|
Only the train and validation dataset are available. The test set is hidden
|
|
33
33
|
for the leaderboard.
|
|
34
34
|
"""
|
|
35
|
-
return {"train": File(path=train), "validation": File(path=validation)}
|
|
35
|
+
return {"train": File.C(path=train), "validation": File.C(path=validation)}
|
|
@@ -30,9 +30,9 @@ def WikiText(data, type):
|
|
|
30
30
|
https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
|
|
31
31
|
"""
|
|
32
32
|
return {
|
|
33
|
-
"train": File(path=data / ("wiki.train.%s" % type)),
|
|
34
|
-
"validation": File(path=data / ("wiki.valid.%s" % type)),
|
|
35
|
-
"test": File(path=data / ("wiki.test.%s" % type)),
|
|
33
|
+
"train": File.C(path=data / ("wiki.train.%s" % type)),
|
|
34
|
+
"validation": File.C(path=data / ("wiki.valid.%s" % type)),
|
|
35
|
+
"test": File.C(path=data / ("wiki.test.%s" % type)),
|
|
36
36
|
}
|
|
37
37
|
|
|
38
38
|
|
|
@@ -31,10 +31,10 @@ def small(ds) -> Movielens:
|
|
|
31
31
|
100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users (as of 9/2018)
|
|
32
32
|
"""
|
|
33
33
|
return {
|
|
34
|
-
"ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
|
|
35
|
-
"links": csv.Generic(path=ds / "links.csv", names_row=0),
|
|
36
|
-
"movies": csv.Generic(path=ds / "movies.csv", names_row=0),
|
|
37
|
-
"tags": csv.Generic(path=ds / "tags.csv", names_row=0),
|
|
34
|
+
"ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
|
|
35
|
+
"links": csv.Generic.C(path=ds / "links.csv", names_row=0),
|
|
36
|
+
"movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
|
|
37
|
+
"tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
|
|
38
38
|
}
|
|
39
39
|
|
|
40
40
|
|
|
@@ -46,8 +46,8 @@ def full(ds) -> Movielens:
|
|
|
46
46
|
27,000,000 ratings and 1,100,000 tag applications applied to 58,000 movies by 280,000 users (as of 9/2018)
|
|
47
47
|
"""
|
|
48
48
|
return {
|
|
49
|
-
"ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
|
|
50
|
-
"links": csv.Generic(path=ds / "links.csv", names_row=0),
|
|
51
|
-
"movies": csv.Generic(path=ds / "movies.csv", names_row=0),
|
|
52
|
-
"tags": csv.Generic(path=ds / "tags.csv", names_row=0),
|
|
49
|
+
"ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
|
|
50
|
+
"links": csv.Generic.C(path=ds / "links.csv", names_row=0),
|
|
51
|
+
"movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
|
|
52
|
+
"tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
|
|
53
53
|
}
|
|
@@ -34,9 +34,9 @@ def gsd(ds) -> Supervised:
|
|
|
34
34
|
is updated since 2015 independently from the previous source.
|
|
35
35
|
"""
|
|
36
36
|
return {
|
|
37
|
-
"train": CoNLL_U(path=ds / "fr_gsd-ud-train.conllu"),
|
|
38
|
-
"test": CoNLL_U(path=ds / "fr_gsd-ud-dev.conllu"),
|
|
39
|
-
"validation": CoNLL_U(path=ds / "fr_gsd-ud-test.conllu"),
|
|
37
|
+
"train": CoNLL_U.C(path=ds / "fr_gsd-ud-train.conllu"),
|
|
38
|
+
"test": CoNLL_U.C(path=ds / "fr_gsd-ud-dev.conllu"),
|
|
39
|
+
"validation": CoNLL_U.C(path=ds / "fr_gsd-ud-test.conllu"),
|
|
40
40
|
}
|
|
41
41
|
|
|
42
42
|
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/__init__.py
RENAMED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Generic data types for information retrieval"""
|
|
2
2
|
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
|
+
from enum import Enum
|
|
4
5
|
from functools import cached_property
|
|
5
6
|
import logging
|
|
6
7
|
from pathlib import Path
|
|
@@ -88,6 +89,19 @@ class Documents(Base):
|
|
|
88
89
|
...
|
|
89
90
|
|
|
90
91
|
|
|
92
|
+
class FileAccess(Enum):
|
|
93
|
+
"""Defines how to access files (e.g. for document stores)"""
|
|
94
|
+
|
|
95
|
+
FILE = 0
|
|
96
|
+
"""Direct file access"""
|
|
97
|
+
|
|
98
|
+
MMAP = 1
|
|
99
|
+
"""Use mmap"""
|
|
100
|
+
|
|
101
|
+
MEMORY = 2
|
|
102
|
+
"""Use memory"""
|
|
103
|
+
|
|
104
|
+
|
|
91
105
|
class DocumentStore(Documents):
|
|
92
106
|
"""A document store
|
|
93
107
|
|
|
@@ -97,6 +111,10 @@ class DocumentStore(Documents):
|
|
|
97
111
|
- return the number of documents
|
|
98
112
|
"""
|
|
99
113
|
|
|
114
|
+
file_access: Meta[FileAccess] = FileAccess.MMAP
|
|
115
|
+
"""How to access the file collection (might not have any impact, depends on
|
|
116
|
+
the docstore)"""
|
|
117
|
+
|
|
100
118
|
def docid_internal2external(self, docid: int):
|
|
101
119
|
"""Converts an internal collection ID (integer) to an external ID"""
|
|
102
120
|
raise NotImplementedError(f"For class {self.__class__}")
|
|
@@ -327,5 +345,4 @@ class PairwiseSampleDataset(Base, ABC):
|
|
|
327
345
|
"""Datasets where each record is a query with positive and negative samples"""
|
|
328
346
|
|
|
329
347
|
@abstractmethod
|
|
330
|
-
def iter(self) -> Iterator[PairwiseSample]:
|
|
331
|
-
...
|
|
348
|
+
def iter(self) -> Iterator[PairwiseSample]: ...
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/datasets/irds/data.py
RENAMED
|
@@ -215,7 +215,19 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
215
215
|
|
|
216
216
|
@cached_property
|
|
217
217
|
def store(self):
|
|
218
|
-
|
|
218
|
+
kwargs = {}
|
|
219
|
+
try:
|
|
220
|
+
# Translate to ir datasets docstore options
|
|
221
|
+
import ir_datasets.indices as ir_indices
|
|
222
|
+
file_access = {
|
|
223
|
+
ir.FileAccess.MMAP: ir_indices.FileAccess.MMAP,
|
|
224
|
+
ir.FileAccess.FILE: ir_indices.FileAccess.FILE,
|
|
225
|
+
ir.FileAccess.MEMORY: ir_indices.FileAccess.MEMORY
|
|
226
|
+
}[self.file_access]
|
|
227
|
+
kwargs = {"options": ir_indices.DocstoreOptions(file_access=file_access)}
|
|
228
|
+
except ImportError:
|
|
229
|
+
logging.warning("This version of ir-datasets cannot handle docstore options")
|
|
230
|
+
return self.dataset.docs_store(**kwargs)
|
|
219
231
|
|
|
220
232
|
@cached_property
|
|
221
233
|
def _docs(self):
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# file generated by setuptools-scm
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
|
|
4
|
+
__all__ = [
|
|
5
|
+
"__version__",
|
|
6
|
+
"__version_tuple__",
|
|
7
|
+
"version",
|
|
8
|
+
"version_tuple",
|
|
9
|
+
"__commit_id__",
|
|
10
|
+
"commit_id",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
TYPE_CHECKING = False
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from typing import Tuple
|
|
16
|
+
from typing import Union
|
|
17
|
+
|
|
18
|
+
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
19
|
+
COMMIT_ID = Union[str, None]
|
|
20
|
+
else:
|
|
21
|
+
VERSION_TUPLE = object
|
|
22
|
+
COMMIT_ID = object
|
|
23
|
+
|
|
24
|
+
version: str
|
|
25
|
+
__version__: str
|
|
26
|
+
__version_tuple__: VERSION_TUPLE
|
|
27
|
+
version_tuple: VERSION_TUPLE
|
|
28
|
+
commit_id: COMMIT_ID
|
|
29
|
+
__commit_id__: COMMIT_ID
|
|
30
|
+
|
|
31
|
+
__version__ = version = '2025.9.11'
|
|
32
|
+
__version_tuple__ = version_tuple = (2025, 9, 11)
|
|
33
|
+
|
|
34
|
+
__commit_id__ = commit_id = 'gadcc9bd27'
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
# file generated by setuptools-scm
|
|
2
|
-
# don't change, don't track in version control
|
|
3
|
-
|
|
4
|
-
__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
|
|
5
|
-
|
|
6
|
-
TYPE_CHECKING = False
|
|
7
|
-
if TYPE_CHECKING:
|
|
8
|
-
from typing import Tuple
|
|
9
|
-
from typing import Union
|
|
10
|
-
|
|
11
|
-
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
12
|
-
else:
|
|
13
|
-
VERSION_TUPLE = object
|
|
14
|
-
|
|
15
|
-
version: str
|
|
16
|
-
__version__: str
|
|
17
|
-
__version_tuple__: VERSION_TUPLE
|
|
18
|
-
version_tuple: VERSION_TUPLE
|
|
19
|
-
|
|
20
|
-
__version__ = version = '2025.7.28'
|
|
21
|
-
__version_tuple__ = version_tuple = (2025, 7, 28)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/.github/workflows/python-publish.yml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/api/recommendation.rst
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/datasets/conversation.rst
RENAMED
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/datasets/embeddings.rst
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/docs/source/datasets/recommendation.rst
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/__init__.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/ai/quac.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/embeddings.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/base.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/cord19.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/csv.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/data.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/formats.py
RENAMED
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/stores.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/trec.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/tagging.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/download/tmdb.py
RENAMED
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/interfaces/trec.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/test/__init__.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/test/test_datasets.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/__init__.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/files.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/iter.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/randomstream.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/shuffle.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.7.28 → datamaestro_text-2025.9.11}/src/datamaestro_text.egg-info/requires.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|