datamaestro-text 2025.7.28__tar.gz → 2026.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. {datamaestro_text-2025.7.28/src/datamaestro_text.egg-info → datamaestro_text-2026.1.1}/PKG-INFO +20 -16
  2. datamaestro_text-2026.1.1/pyproject.toml +87 -0
  3. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/com/github/aagohary/canard.py +3 -3
  4. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +2 -1
  5. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +1 -1
  6. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/com/oscar-corpus.py +1 -1
  7. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/com/sentiment140.py +2 -2
  8. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +1 -1
  9. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/edu/stanford/aclimdb.py +2 -2
  10. datamaestro_text-2026.1.1/src/datamaestro_text/config/fr/granddebat.py +186 -0
  11. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +1 -1
  12. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/io/metamind/research/wikitext.py +3 -3
  13. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/org/grouplens/movielens.py +8 -8
  14. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/org/universaldependencies/french.py +3 -3
  15. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/conversation/base.py +2 -2
  16. datamaestro_text-2026.1.1/src/datamaestro_text/data/debate/__init__.py +5 -0
  17. datamaestro_text-2026.1.1/src/datamaestro_text/data/debate/granddebat.py +68 -0
  18. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/ir/__init__.py +19 -2
  19. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/ir/csv.py +7 -8
  20. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/ir/formats.py +1 -3
  21. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/datasets/irds/data.py +24 -13
  22. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/datasets/irds/datasets.py +1 -1
  23. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/transforms/ir/__init__.py +1 -1
  24. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/version.py +16 -3
  25. datamaestro_text-2025.7.28/.circleci/config.yml +0 -53
  26. datamaestro_text-2025.7.28/.flake8 +0 -5
  27. datamaestro_text-2025.7.28/.github/workflows/pytest.yml +0 -38
  28. datamaestro_text-2025.7.28/.github/workflows/python-publish.yml +0 -30
  29. datamaestro_text-2025.7.28/.pre-commit-config.yaml +0 -19
  30. datamaestro_text-2025.7.28/.readthedocs.yml +0 -23
  31. datamaestro_text-2025.7.28/MANIFEST.in +0 -10
  32. datamaestro_text-2025.7.28/Makefile +0 -29
  33. datamaestro_text-2025.7.28/PKG-INFO +0 -41
  34. datamaestro_text-2025.7.28/docs/Makefile +0 -20
  35. datamaestro_text-2025.7.28/docs/make.bat +0 -35
  36. datamaestro_text-2025.7.28/docs/requirements.txt +0 -5
  37. datamaestro_text-2025.7.28/docs/source/api/conversation.rst +0 -77
  38. datamaestro_text-2025.7.28/docs/source/api/embeddings.rst +0 -6
  39. datamaestro_text-2025.7.28/docs/source/api/index.rst +0 -11
  40. datamaestro_text-2025.7.28/docs/source/api/ir.rst +0 -119
  41. datamaestro_text-2025.7.28/docs/source/api/nlp.rst +0 -4
  42. datamaestro_text-2025.7.28/docs/source/api/recommendation.rst +0 -6
  43. datamaestro_text-2025.7.28/docs/source/api/text.rst +0 -6
  44. datamaestro_text-2025.7.28/docs/source/conf.py +0 -83
  45. datamaestro_text-2025.7.28/docs/source/datasets/conversation.rst +0 -12
  46. datamaestro_text-2025.7.28/docs/source/datasets/embeddings.rst +0 -7
  47. datamaestro_text-2025.7.28/docs/source/datasets/index.rst +0 -12
  48. datamaestro_text-2025.7.28/docs/source/datasets/ir.rst +0 -13
  49. datamaestro_text-2025.7.28/docs/source/datasets/irds.rst +0 -20
  50. datamaestro_text-2025.7.28/docs/source/datasets/recommendation.rst +0 -7
  51. datamaestro_text-2025.7.28/docs/source/datasets/text.rst +0 -9
  52. datamaestro_text-2025.7.28/docs/source/index.rst +0 -30
  53. datamaestro_text-2025.7.28/mkdocs.yml +0 -28
  54. datamaestro_text-2025.7.28/pyproject.toml +0 -57
  55. datamaestro_text-2025.7.28/requirements-dev.txt +0 -2
  56. datamaestro_text-2025.7.28/requirements.txt +0 -3
  57. datamaestro_text-2025.7.28/setup.cfg +0 -4
  58. datamaestro_text-2025.7.28/src/datamaestro_text.egg-info/SOURCES.txt +0 -123
  59. datamaestro_text-2025.7.28/src/datamaestro_text.egg-info/dependency_links.txt +0 -1
  60. datamaestro_text-2025.7.28/src/datamaestro_text.egg-info/entry_points.txt +0 -3
  61. datamaestro_text-2025.7.28/src/datamaestro_text.egg-info/requires.txt +0 -10
  62. datamaestro_text-2025.7.28/src/datamaestro_text.egg-info/top_level.txt +0 -1
  63. datamaestro_text-2025.7.28/tox.ini +0 -13
  64. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/.gitignore +0 -0
  65. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/LICENSE +0 -0
  66. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/README.md +0 -0
  67. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/__init__.py +0 -0
  68. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/__init__.py +0 -0
  69. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/ai/quac.yaml +0 -0
  70. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
  71. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/com/github/ikat.py +0 -0
  72. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -0
  73. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
  74. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
  75. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
  76. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
  77. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
  78. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
  79. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
  80. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
  81. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
  82. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/gov/__init__.py +0 -0
  83. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
  84. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
  85. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
  86. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
  87. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
  88. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
  89. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
  90. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -0
  91. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
  92. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
  93. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
  94. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
  95. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
  96. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
  97. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
  98. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/__init__.py +0 -0
  99. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/conversation/__init__.py +0 -0
  100. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/conversation/canard.py +0 -0
  101. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/conversation/ikat.py +0 -0
  102. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/conversation/orconvqa.py +0 -0
  103. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/conversation/qrecc.py +0 -0
  104. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/embeddings.py +0 -0
  105. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/ir/base.py +0 -0
  106. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/ir/cord19.py +0 -0
  107. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/ir/data.py +0 -0
  108. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/ir/huggingface.py +0 -0
  109. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/ir/stores.py +0 -0
  110. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/ir/trec.py +0 -0
  111. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/ir/utils.py +0 -0
  112. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/recommendation.py +0 -0
  113. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/tagging.py +0 -0
  114. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/text.py +0 -0
  115. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
  116. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/datasets/irds/helpers.py +0 -0
  117. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/datasets/irds/utils.py +0 -0
  118. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/download/tmdb.py +0 -0
  119. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/interfaces/plaintext.py +0 -0
  120. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/interfaces/trec.py +0 -0
  121. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/test/__init__.py +0 -0
  122. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/test/test_datasets.py +0 -0
  123. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/test/test_documented.py +0 -0
  124. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/transforms/__init__.py +0 -0
  125. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/utils/__init__.py +0 -0
  126. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/utils/files.py +0 -0
  127. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/utils/iter.py +0 -0
  128. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/utils/randomstream.py +0 -0
  129. {datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/utils/shuffle.py +0 -0
@@ -1,33 +1,37 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro-text
3
- Version: 2025.7.28
3
+ Version: 2026.1.1
4
4
  Summary: Datamaestro module for text-related datasets
5
+ Project-URL: Homepage, https://github.com/experimaestro/datamaestro_text
6
+ Project-URL: Documentation, https://datamaestro-text.readthedocs.io/en/latest/
7
+ Project-URL: Repository, https://github.com/experimaestro/datamaestro_text
8
+ Project-URL: Bug Tracker, https://github.com/experimaestro/datamaestro_text/issues
5
9
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
- License: GPL-3
7
- Project-URL: homepage, https://github.com/experimaestro/datamaestro_text
8
- Project-URL: documentation, https://datamaestro-text.readthedocs.io/en/latest/
9
- Project-URL: repository, https://github.com/experimaestro/datamaestro_text
10
- Keywords: dataset manager,information retrieval,experiments
10
+ License: GPL-3.0-or-later
11
+ License-File: LICENSE
12
+ Keywords: dataset manager,experiments,information retrieval
11
13
  Classifier: Development Status :: 4 - Beta
12
14
  Classifier: Intended Audience :: Science/Research
13
15
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
14
16
  Classifier: Operating System :: OS Independent
15
17
  Classifier: Programming Language :: Python
16
18
  Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
17
22
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
23
  Requires-Python: >=3.10
19
- Description-Content-Type: text/markdown
20
- License-File: LICENSE
21
- Requires-Dist: datamaestro>=1.5.0
22
- Requires-Dist: ir_datasets>=0.5.8
23
24
  Requires-Dist: attrs
25
+ Requires-Dist: datamaestro>=1.6.2
26
+ Requires-Dist: experimaestro
27
+ Requires-Dist: ir-datasets>=0.5.8
24
28
  Provides-Extra: dev
25
- Requires-Dist: pytest; extra == "dev"
26
- Requires-Dist: docutils; extra == "dev"
27
- Requires-Dist: sphobjinv; extra == "dev"
28
- Requires-Dist: flake8; extra == "dev"
29
- Requires-Dist: sphinx; extra == "dev"
30
- Dynamic: license-file
29
+ Requires-Dist: docutils; extra == 'dev'
30
+ Requires-Dist: flake8; extra == 'dev'
31
+ Requires-Dist: pytest; extra == 'dev'
32
+ Requires-Dist: sphinx<8; extra == 'dev'
33
+ Requires-Dist: sphobjinv; extra == 'dev'
34
+ Description-Content-Type: text/markdown
31
35
 
32
36
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)
33
37
 
@@ -0,0 +1,87 @@
1
+ [build-system]
2
+ requires = ["hatchling", "hatch-vcs"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "datamaestro-text"
7
+ authors = [
8
+ {name = "Benjamin Piwowarski", email = "benjamin@piwowarski.fr"}
9
+ ]
10
+ description = "Datamaestro module for text-related datasets"
11
+ readme = "README.md"
12
+ license = {text = "GPL-3.0-or-later"}
13
+ keywords = ["dataset manager", "information retrieval", "experiments"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Science/Research",
17
+ "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
18
+ "Operating System :: OS Independent",
19
+ "Programming Language :: Python",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Topic :: Software Development :: Libraries :: Python Modules",
25
+ ]
26
+ requires-python = ">=3.10"
27
+ dynamic = ["version"]
28
+ dependencies = [
29
+ "datamaestro>=1.6.2",
30
+ "ir_datasets>=0.5.8",
31
+ "attrs",
32
+ "experimaestro",
33
+ ]
34
+
35
+ [project.optional-dependencies]
36
+ dev = [
37
+ "pytest",
38
+ "docutils",
39
+ "sphobjinv",
40
+ "flake8",
41
+ "sphinx<8",
42
+ ]
43
+
44
+ [project.urls]
45
+ Homepage = "https://github.com/experimaestro/datamaestro_text"
46
+ Documentation = "https://datamaestro-text.readthedocs.io/en/latest/"
47
+ Repository = "https://github.com/experimaestro/datamaestro_text"
48
+ "Bug Tracker" = "https://github.com/experimaestro/datamaestro_text/issues"
49
+
50
+ [project.entry-points."datamaestro.repositories"]
51
+ text = "datamaestro_text:Repository"
52
+ irds = "datamaestro_text.datasets.irds:Repository"
53
+
54
+ [tool.hatch.version]
55
+ source = "vcs"
56
+
57
+ [tool.hatch.version.raw-options]
58
+ local_scheme = "no-local-version"
59
+
60
+ [tool.hatch.build.hooks.vcs]
61
+ version-file = "src/datamaestro_text/version.py"
62
+
63
+ [tool.hatch.build.targets.sdist]
64
+ include = [
65
+ "/src",
66
+ "/README.md",
67
+ "/LICENSE",
68
+ "/pyproject.toml",
69
+ ]
70
+
71
+ [tool.hatch.build.targets.wheel]
72
+ packages = ["src/datamaestro_text"]
73
+
74
+ [tool.pytest.ini_options]
75
+ junit_family = "xunit2"
76
+ testpaths = ["src/datamaestro_text"]
77
+ norecursedirs = ["node_modules"]
78
+
79
+ [dependency-groups]
80
+ dev = [
81
+ "docutils>=0.21.2",
82
+ "flake8>=7.3.0",
83
+ "git-cliff>=2.11.0",
84
+ "pytest>=8.4.1",
85
+ "sphinx>=7,<8",
86
+ "sphobjinv>=2.3.1.3",
87
+ ]
@@ -37,7 +37,7 @@ def main(train, dev, test):
37
37
  Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
38
38
  """
39
39
  return {
40
- "train": CanardDataset(path=train),
41
- "validation": CanardDataset(path=dev),
42
- "test": CanardDataset(path=test),
40
+ "train": CanardDataset.C(path=train),
41
+ "validation": CanardDataset.C(path=dev),
42
+ "test": CanardDataset.C(path=test),
43
43
  }
@@ -51,6 +51,7 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
51
51
  )
52
52
  class Content(LZ4JSONLDocumentStore):
53
53
  """QReCC mentionned URLs content"""
54
+
54
55
  @staticmethod
55
56
  def __create_dataset__(dataset, options=None):
56
57
  ds = reference(reference=main).setup(dataset, options)
@@ -65,7 +66,7 @@ class Content(LZ4JSONLDocumentStore):
65
66
  "id",
66
67
  ).setup(dataset, options)
67
68
 
68
- return Content(jsonl_path=store_path)
69
+ return Content.C(jsonl_path=store_path)
69
70
 
70
71
  @staticmethod
71
72
  def _documents(path: Path):
@@ -47,7 +47,7 @@ http://www.msmarco.org/dataset.aspx""",
47
47
  @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
48
48
  def collection_etc(data) -> Folder:
49
49
  """Documents and some more files"""
50
- return Folder(path=data)
50
+ return Folder.C(path=data)
51
51
 
52
52
 
53
53
  @lua
@@ -1,4 +1,4 @@
1
- from datamaestro.definitions import argument, datatasks, datatags, dataset
1
+ from datamaestro.definitions import datatasks, datatags, dataset
2
2
  from datamaestro.download.single import filedownloader
3
3
  from datamaestro_text.data.text import TextFile
4
4
  from datamaestro.utils import HashCheck
@@ -27,6 +27,6 @@ def english(dir):
27
27
  If you use this data, please cite Sentiment140 as your source.
28
28
  """
29
29
  return Supervised.C(
30
- train=Generic(path=dir / "training.1600000.processed.noemoticon.csv"),
31
- test=Generic(path=dir / "testdata.manual.2009.06.14.csv"),
30
+ train=Generic.C(path=dir / "training.1600000.processed.noemoticon.csv"),
31
+ test=Generic.C(path=dir / "testdata.manual.2009.06.14.csv"),
32
32
  )
@@ -1,6 +1,6 @@
1
1
  # See documentation on https://datamaestro.readthedocs.io
2
2
 
3
- from datamaestro.definitions import argument, datatasks, datatags, dataset
3
+ from datamaestro.definitions import datatasks, datatags, dataset
4
4
  from datamaestro_text.data.text import TextFolder
5
5
  from datamaestro.download.archive import tardownloader
6
6
  from datamaestro.utils import HashCheck
@@ -11,6 +11,6 @@ def aclimdb(data):
11
11
  Paper http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf
12
12
  """
13
13
  return {
14
- "train": FolderBased(path=data / "train", classes=["neg", "pos"]),
15
- "test": FolderBased(path=data / "test", classes=["neg", "pos"]),
14
+ "train": FolderBased.C(path=data / "train", classes=["neg", "pos"]),
15
+ "test": FolderBased.C(path=data / "test", classes=["neg", "pos"]),
16
16
  }
@@ -0,0 +1,186 @@
1
+ # See documentation on https://datamaestro.readthedocs.io
2
+
3
+ from pathlib import Path
4
+ from datamaestro.definitions import datatags, dataset
5
+ from datamaestro_text.data.debate import GrandDebatFile
6
+ from datamaestro.download.single import filedownloader
7
+ from datamaestro.utils import HashCheck
8
+ from datamaestro.stream import Transform
9
+ import io
10
+ import json
11
+ import ijson
12
+ import os
13
+ import threading
14
+
15
+
16
+ class JsonToJsonl(Transform):
17
+ """Transforms a JSON file with an array into a JSONL file with one line per
18
+ array element"""
19
+
20
+ def __call__(self, fileobj: io.IOBase) -> io.IOBase:
21
+ # Stream items from the top-level array into a read-end pipe.
22
+ try:
23
+ fileobj.seek(0)
24
+ except Exception:
25
+ pass
26
+
27
+ r_fd, w_fd = os.pipe()
28
+ r_file = os.fdopen(r_fd, "rb")
29
+ w_file = os.fdopen(w_fd, "wb")
30
+
31
+ def _writer(fin, fout):
32
+ try:
33
+ for item in ijson.items(fin, "item"):
34
+ line = json.dumps(item, ensure_ascii=False) + "\n"
35
+ fout.write(line.encode("utf-8"))
36
+ fout.close()
37
+ except Exception:
38
+ try:
39
+ fout.close()
40
+ except Exception:
41
+ pass
42
+
43
+ t = threading.Thread(target=_writer, args=(fileobj, w_file), daemon=True)
44
+ t.start()
45
+
46
+ return r_file
47
+
48
+
49
+ @filedownloader(
50
+ "la_transition_ecologique_2019_03_21.jsonl",
51
+ "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_TRANSITION_ECOLOGIQUE.json",
52
+ checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
53
+ transforms=JsonToJsonl(),
54
+ )
55
+ @datatags("politics", "debate", "french")
56
+ @dataset(
57
+ GrandDebatFile,
58
+ url="https://granddebat.fr",
59
+ )
60
+ def transition(la_transition_ecologique_2019_03_21: Path):
61
+ """Grand Débat National (transition écologique)
62
+
63
+ The *Grand Débat National* (GDN) is a country-wide citizen consultation held
64
+ in France in 2019.
65
+
66
+
67
+ The consultation prompted citizens to express their views across four main
68
+ themes: *Taxation and public spending*, *Organization of the state and
69
+ public services*, *Democracy and citizenship*, and *Ecological transition*.
70
+ A significant portion of this consultation involved online questionnaires,
71
+ each concluding with a critical open-ended prompt: "Do you have anything to
72
+ add about [theme]?".
73
+ """
74
+ return GrandDebatFile.C(path=la_transition_ecologique_2019_03_21)
75
+
76
+
77
+ @filedownloader(
78
+ "fiscalité_et_dépenses_publiques_2019_03_21.jsonl",
79
+ "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json",
80
+ checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
81
+ transforms=JsonToJsonl(),
82
+ )
83
+ @datatags("politics", "debate", "french")
84
+ @dataset(
85
+ GrandDebatFile,
86
+ url="https://granddebat.fr",
87
+ )
88
+ def fiscalité(fiscalité_et_dépenses_publiques_2019_03_21: Path):
89
+ """Grand Débat National (fiscalité et dépenses publiques)
90
+
91
+ The *Grand Débat National* (GDN) is a country-wide citizen consultation held
92
+ in France in 2019.
93
+
94
+
95
+ The consultation prompted citizens to express their views across four main
96
+ themes: *Taxation and public spending*, *Organization of the state and
97
+ public services*, *Democracy and citizenship*, and *Ecological transition*.
98
+ A significant portion of this consultation involved online questionnaires,
99
+ each concluding with a critical open-ended prompt: "Do you have anything to
100
+ add about [theme]?".
101
+ """
102
+ return GrandDebatFile.C(path=fiscalité_et_dépenses_publiques_2019_03_21)
103
+
104
+
105
+ @filedownloader(
106
+ "democratie_et_citoyennete_2019_03_21.jsonl",
107
+ "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/DEMOCRATIE_ET_CITOYENNETE.json",
108
+ checker=HashCheck("049aaeca7e51747f2da5d68143c760fa"),
109
+ transforms=JsonToJsonl(),
110
+ )
111
+ @datatags("politics", "debate", "french")
112
+ @dataset(
113
+ GrandDebatFile,
114
+ url="https://granddebat.fr",
115
+ )
116
+ def démocratie(democratie_et_citoyennete_2019_03_21: Path):
117
+ """Grand Débat National (démocratie et citoyenneté)
118
+
119
+ The *Grand Débat National* (GDN) is a country-wide citizen consultation held
120
+ in France in 2019.
121
+
122
+
123
+ The consultation prompted citizens to express their views across four main
124
+ themes: *Taxation and public spending*, *Organization of the state and
125
+ public services*, *Democracy and citizenship*, and *Ecological transition*.
126
+ A significant portion of this consultation involved online questionnaires,
127
+ each concluding with a critical open-ended prompt: "Do you have anything to
128
+ add about [theme]?".
129
+ """
130
+ return GrandDebatFile.C(path=democratie_et_citoyennete_2019_03_21)
131
+
132
+
133
+ @filedownloader(
134
+ "organisation_etat_services_publics_2019_03_21.jsonl",
135
+ "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json",
136
+ checker=HashCheck("0ccb7c401889f738b73b0caab897a68b"),
137
+ transforms=JsonToJsonl(),
138
+ )
139
+ @datatags("politics", "debate", "french")
140
+ @dataset(
141
+ GrandDebatFile,
142
+ url="https://granddebat.fr",
143
+ )
144
+ def organisation(organisation_etat_services_publics_2019_03_21: Path):
145
+ """Grand Débat National (organisation de l'État et des services publics)
146
+
147
+ The *Grand Débat National* (GDN) is a country-wide citizen consultation held
148
+ in France in 2019.
149
+
150
+
151
+ The consultation prompted citizens to express their views across four main
152
+ themes: *Taxation and public spending*, *Organization of the state and
153
+ public services*, *Democracy and citizenship*, and *Ecological transition*.
154
+ A significant portion of this consultation involved online questionnaires,
155
+ each concluding with a critical open-ended prompt: "Do you have anything to
156
+ add about [theme]?".
157
+ """
158
+ return GrandDebatFile.C(path=organisation_etat_services_publics_2019_03_21)
159
+
160
+
161
+ @filedownloader(
162
+ "les_evenements_2019_03_21.jsonl",
163
+ "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LES_EVENEMENTS.json",
164
+ checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
165
+ transforms=JsonToJsonl(),
166
+ )
167
+ @datatags("politics", "debate", "french")
168
+ @dataset(
169
+ GrandDebatFile,
170
+ url="https://granddebat.fr",
171
+ )
172
+ def evenements(les_evenements_2019_03_21: Path):
173
+ """Grand Débat National (événements)
174
+
175
+ The *Grand Débat National* (GDN) is a country-wide citizen consultation held
176
+ in France in 2019.
177
+
178
+
179
+ The consultation prompted citizens to express their views across four main
180
+ themes: *Taxation and public spending*, *Organization of the state and
181
+ public services*, *Democracy and citizenship*, and *Ecological transition*.
182
+ A significant portion of this consultation involved online questionnaires,
183
+ each concluding with a critical open-ended prompt: "Do you have anything to
184
+ add about [theme]?".
185
+ """
186
+ return GrandDebatFile.C(path=les_evenements_2019_03_21)
@@ -32,4 +32,4 @@ def v1(train, validation):
32
32
  Only the train and validation dataset are available. The test set is hidden
33
33
  for the leaderboard.
34
34
  """
35
- return {"train": File(path=train), "validation": File(path=validation)}
35
+ return {"train": File.C(path=train), "validation": File.C(path=validation)}
@@ -30,9 +30,9 @@ def WikiText(data, type):
30
30
  https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
31
31
  """
32
32
  return {
33
- "train": File(path=data / ("wiki.train.%s" % type)),
34
- "validation": File(path=data / ("wiki.valid.%s" % type)),
35
- "test": File(path=data / ("wiki.test.%s" % type)),
33
+ "train": File.C(path=data / ("wiki.train.%s" % type)),
34
+ "validation": File.C(path=data / ("wiki.valid.%s" % type)),
35
+ "test": File.C(path=data / ("wiki.test.%s" % type)),
36
36
  }
37
37
 
38
38
 
@@ -31,10 +31,10 @@ def small(ds) -> Movielens:
31
31
  100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users (as of 9/2018)
32
32
  """
33
33
  return {
34
- "ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
35
- "links": csv.Generic(path=ds / "links.csv", names_row=0),
36
- "movies": csv.Generic(path=ds / "movies.csv", names_row=0),
37
- "tags": csv.Generic(path=ds / "tags.csv", names_row=0),
34
+ "ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
35
+ "links": csv.Generic.C(path=ds / "links.csv", names_row=0),
36
+ "movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
37
+ "tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
38
38
  }
39
39
 
40
40
 
@@ -46,8 +46,8 @@ def full(ds) -> Movielens:
46
46
  27,000,000 ratings and 1,100,000 tag applications applied to 58,000 movies by 280,000 users (as of 9/2018)
47
47
  """
48
48
  return {
49
- "ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
50
- "links": csv.Generic(path=ds / "links.csv", names_row=0),
51
- "movies": csv.Generic(path=ds / "movies.csv", names_row=0),
52
- "tags": csv.Generic(path=ds / "tags.csv", names_row=0),
49
+ "ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
50
+ "links": csv.Generic.C(path=ds / "links.csv", names_row=0),
51
+ "movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
52
+ "tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
53
53
  }
@@ -34,9 +34,9 @@ def gsd(ds) -> Supervised:
34
34
  is updated since 2015 independently from the previous source.
35
35
  """
36
36
  return {
37
- "train": CoNLL_U(path=ds / "fr_gsd-ud-train.conllu"),
38
- "test": CoNLL_U(path=ds / "fr_gsd-ud-dev.conllu"),
39
- "validation": CoNLL_U(path=ds / "fr_gsd-ud-test.conllu"),
37
+ "train": CoNLL_U.C(path=ds / "fr_gsd-ud-train.conllu"),
38
+ "test": CoNLL_U.C(path=ds / "fr_gsd-ud-dev.conllu"),
39
+ "validation": CoNLL_U.C(path=ds / "fr_gsd-ud-test.conllu"),
40
40
  }
41
41
 
42
42
 
@@ -267,7 +267,7 @@ class ConversationUserTopics(Topics):
267
267
  """Returns an iterator over topics"""
268
268
  # Extracts topics from conversations, Each user query is a topic (can perform retrieval on it)
269
269
  # TODO: merge with xpmir.learning.DatasetConversationBase -> same logic
270
-
270
+
271
271
  records: List[TopicRecord] = []
272
272
  for conversation in self.conversations.__iter__():
273
273
  nodes = [
@@ -279,4 +279,4 @@ class ConversationUserTopics(Topics):
279
279
  records.append(
280
280
  node.entry.update(ConversationHistoryItem(node.history()))
281
281
  )
282
- return iter(records)
282
+ return iter(records)
@@ -0,0 +1,5 @@
1
+ """Data classes for debate datasets"""
2
+
3
+ from .granddebat import GrandDebatEntry, GrandDebatFile, GrandDebatResponse
4
+
5
+ __all__ = ["GrandDebatEntry", "GrandDebatFile", "GrandDebatResponse"]
@@ -0,0 +1,68 @@
1
+ """Data classes for the Grand Débat National dataset"""
2
+
3
+ import json
4
+ from dataclasses import dataclass, field
5
+ from typing import Iterator, List, Optional
6
+
7
+ from datamaestro.data import File
8
+
9
+
10
@dataclass
class GrandDebatResponse:
    """One answer to one question within a Grand Débat National contribution"""

    # Identifier of the answered question
    question_id: str
    # Title of the answered question
    question_title: str
    # Answer value; may be None
    value: Optional[str]
    # Formatted variant of the answer value; may be None
    formatted_value: Optional[str]
18
+
19
+
20
@dataclass
class GrandDebatEntry:
    """A single contribution (entry) of the Grand Débat National dataset"""

    # --- Identification of the contribution
    id: str
    reference: str
    title: str

    # --- Timestamps, kept as strings (no parsing is performed by this class)
    created_at: str
    published_at: str
    updated_at: Optional[str]

    # --- Moderation state
    trashed: bool
    trashed_status: Optional[str]

    # --- Author information
    author_id: str
    author_type: str
    author_zip_code: str

    # --- Answers attached to the contribution; every instance gets its own
    # fresh list when none is supplied
    responses: List[GrandDebatResponse] = field(default_factory=list)
36
+
37
+
38
class GrandDebatFile(File):
    """A Grand Débat National JSONL file with iteration support

    Every non-blank line of the file is parsed as one JSON object describing
    a contribution.
    """

    def __iter__(self) -> Iterator[GrandDebatEntry]:
        """Iterate over entries in the JSONL file

        Blank (whitespace-only) lines are skipped so that a stray empty line
        does not abort the iteration.

        Yields:
            One GrandDebatEntry per non-blank line, in file order.

        Raises:
            json.JSONDecodeError: if a non-blank line is not valid JSON.
            KeyError: if a mandatory key is missing from a record.
        """
        with self.path.open("r", encoding="utf-8") as f:
            for line in f:
                # json.loads raises on empty input: ignore blank lines
                if not line.strip():
                    continue

                data = json.loads(line)
                responses = [
                    GrandDebatResponse(
                        question_id=r["questionId"],
                        question_title=r["questionTitle"],
                        value=r.get("value"),
                        formatted_value=r.get("formattedValue"),
                    )
                    # `or []` covers both a missing key and an explicit null
                    for r in data.get("responses") or []
                ]
                yield GrandDebatEntry(
                    id=data["id"],
                    reference=data["reference"],
                    title=data["title"],
                    created_at=data["createdAt"],
                    published_at=data["publishedAt"],
                    updated_at=data.get("updatedAt"),
                    trashed=data["trashed"],
                    trashed_status=data.get("trashedStatus"),
                    author_id=data["authorId"],
                    author_type=data["authorType"],
                    author_zip_code=data["authorZipCode"],
                    responses=responses,
                )
@@ -1,6 +1,7 @@
1
1
  """Generic data types for information retrieval"""
2
2
 
3
3
  from abc import ABC, abstractmethod
4
+ from enum import Enum
4
5
  from functools import cached_property
5
6
  import logging
6
7
  from pathlib import Path
@@ -88,6 +89,19 @@ class Documents(Base):
88
89
  ...
89
90
 
90
91
 
92
class FileAccess(Enum):
    """Strategy used to access files (e.g. for document stores)"""

    FILE = 0
    """Access the file directly"""

    MMAP = 1
    """Access through mmap"""

    MEMORY = 2
    """Use memory"""
103
+
104
+
91
105
  class DocumentStore(Documents):
92
106
  """A document store
93
107
 
@@ -97,6 +111,10 @@ class DocumentStore(Documents):
97
111
  - return the number of documents
98
112
  """
99
113
 
114
+ file_access: Meta[FileAccess] = FileAccess.MMAP
115
+ """How to access the file collection (might not have any impact, depends on
116
+ the docstore)"""
117
+
100
118
  def docid_internal2external(self, docid: int):
101
119
  """Converts an internal collection ID (integer) to an external ID"""
102
120
  raise NotImplementedError(f"For class {self.__class__}")
@@ -327,5 +345,4 @@ class PairwiseSampleDataset(Base, ABC):
327
345
  """Datasets where each record is a query with positive and negative samples"""
328
346
 
329
347
  @abstractmethod
330
- def iter(self) -> Iterator[PairwiseSample]:
331
- ...
348
+ def iter(self) -> Iterator[PairwiseSample]: ...
@@ -1,27 +1,26 @@
1
1
  from functools import cached_property
2
2
  from pathlib import Path
3
- from typing import Iterator, Tuple, Type
4
3
 
5
- from experimaestro import Param, Option, Constant, Meta
6
- from datamaestro.definitions import argument
4
+ from experimaestro import Param, Meta
7
5
  from datamaestro.record import Record, RecordType
8
6
  import datamaestro_text.data.ir as ir
9
7
  from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
10
8
  from datamaestro_text.interfaces.plaintext import read_tsv
11
9
 
12
10
 
13
- @argument("path", type=Path)
14
- @argument("separator", type=str, default="\t", ignored=True)
15
11
  class AdhocRunWithText(ir.AdhocRun):
16
12
  "(qid, doc.id, query, passage)"
17
- pass
13
+
14
+ path: Meta[Path]
15
+ separator: Meta[str] = "\t"
18
16
 
19
17
 
20
- @argument("path", type=Path)
21
- @argument("separator", type=str, default="\t", ignored=True)
22
18
  class Topics(ir.Topics):
23
19
  "Pairs of query id - query using a separator"
24
20
 
21
+ path: Meta[Path]
22
+ separator: Meta[str] = "\t"
23
+
25
24
  def iter(self):
26
25
  return (
27
26
  Record(IDItem(qid), SimpleTextItem(title))
@@ -10,10 +10,8 @@ from ir_datasets.datasets.cord19 import Cord19FullTextSection
10
10
  @define
11
11
  class DocumentWithTitle(TextItem):
12
12
  """Web document with title and body"""
13
-
14
- body: str
15
-
16
13
  title: str
14
+ body: str
17
15
 
18
16
  @cached_property
19
17
  def text(self):