datamaestro-text 2025.4.3__tar.gz → 2025.5.13__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (125):
  1. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/.flake8 +1 -1
  2. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/.github/workflows/pytest.yml +3 -5
  3. {datamaestro_text-2025.4.3/src/datamaestro_text.egg-info → datamaestro_text-2025.5.13}/PKG-INFO +8 -2
  4. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/pyproject.toml +10 -0
  5. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/requirements.txt +1 -1
  6. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/github/aagohary/canard.py +19 -12
  7. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +6 -8
  8. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/sentiment140.py +1 -2
  9. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/stanford/glove.py +1 -0
  10. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/conversation/base.py +10 -8
  11. datamaestro_text-2025.5.13/src/datamaestro_text/data/conversation/canard.py +107 -0
  12. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/conversation/orconvqa.py +0 -1
  13. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/base.py +6 -0
  14. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/formats.py +31 -4
  15. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/datasets/irds/data.py +65 -0
  16. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/datasets/irds/datasets.py +0 -4
  17. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/utils/iter.py +5 -2
  18. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/version.py +2 -2
  19. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13/src/datamaestro_text.egg-info}/PKG-INFO +8 -2
  20. datamaestro_text-2025.5.13/src/datamaestro_text.egg-info/requires.txt +10 -0
  21. datamaestro_text-2025.4.3/src/datamaestro_text/data/conversation/canard.py +0 -68
  22. datamaestro_text-2025.4.3/src/datamaestro_text.egg-info/requires.txt +0 -3
  23. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/.circleci/config.yml +0 -0
  24. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/.github/workflows/python-publish.yml +0 -0
  25. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/.gitignore +0 -0
  26. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/.pre-commit-config.yaml +0 -0
  27. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/.readthedocs.yml +0 -0
  28. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/LICENSE +0 -0
  29. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/MANIFEST.in +0 -0
  30. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/Makefile +0 -0
  31. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/README.md +0 -0
  32. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/Makefile +0 -0
  33. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/make.bat +0 -0
  34. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/requirements.txt +0 -0
  35. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/source/api/conversation.rst +0 -0
  36. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/source/api/embeddings.rst +0 -0
  37. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/source/api/index.rst +0 -0
  38. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/source/api/ir.rst +0 -0
  39. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/source/api/nlp.rst +0 -0
  40. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/source/api/recommendation.rst +0 -0
  41. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/source/api/text.rst +0 -0
  42. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/source/conf.py +0 -0
  43. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/source/datasets/conversation.rst +0 -0
  44. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/source/datasets/embeddings.rst +0 -0
  45. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/source/datasets/index.rst +0 -0
  46. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/source/datasets/ir.rst +0 -0
  47. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/source/datasets/irds.rst +0 -0
  48. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/source/datasets/recommendation.rst +0 -0
  49. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/source/datasets/text.rst +0 -0
  50. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/docs/source/index.rst +0 -0
  51. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/mkdocs.yml +0 -0
  52. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/requirements-dev.txt +0 -0
  53. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/setup.cfg +0 -0
  54. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/__init__.py +0 -0
  55. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/__init__.py +0 -0
  56. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/ai/quac.yaml +0 -0
  57. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
  58. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -0
  59. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
  60. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
  61. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
  62. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
  63. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
  64. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
  65. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
  66. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
  67. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
  68. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
  69. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
  70. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
  71. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/__init__.py +0 -0
  72. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
  73. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
  74. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
  75. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
  76. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
  77. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
  78. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
  79. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -0
  80. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
  81. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
  82. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
  83. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
  84. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
  85. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
  86. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
  87. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
  88. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
  89. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
  90. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
  91. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/__init__.py +0 -0
  92. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/conversation/__init__.py +0 -0
  93. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/conversation/qrecc.py +0 -0
  94. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/embeddings.py +0 -0
  95. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/__init__.py +0 -0
  96. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/cord19.py +0 -0
  97. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/csv.py +0 -0
  98. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/data.py +0 -0
  99. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/huggingface.py +0 -0
  100. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/stores.py +0 -0
  101. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/trec.py +0 -0
  102. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/utils.py +0 -0
  103. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/recommendation.py +0 -0
  104. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/tagging.py +0 -0
  105. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/text.py +0 -0
  106. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
  107. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/datasets/irds/helpers.py +0 -0
  108. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/datasets/irds/utils.py +0 -0
  109. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/download/tmdb.py +0 -0
  110. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/interfaces/plaintext.py +0 -0
  111. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/interfaces/trec.py +0 -0
  112. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/test/__init__.py +0 -0
  113. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/test/test_datasets.py +0 -0
  114. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/test/test_documented.py +0 -0
  115. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/transforms/__init__.py +0 -0
  116. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
  117. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/utils/__init__.py +0 -0
  118. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/utils/files.py +0 -0
  119. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/utils/randomstream.py +0 -0
  120. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text/utils/shuffle.py +0 -0
  121. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text.egg-info/SOURCES.txt +0 -0
  122. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
  123. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
  124. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/src/datamaestro_text.egg-info/top_level.txt +0 -0
  125. {datamaestro_text-2025.4.3 → datamaestro_text-2025.5.13}/tox.ini +0 -0
@@ -1,5 +1,5 @@
1
1
  [flake8]
2
- ignore = E203, E266, E501, W503, F403, F401
2
+ ignore = E203, E266, E501, W503, F403, F401, E704
3
3
  max-line-length = 79
4
4
  max-complexity = 18
5
5
  select = B,C,E,F,W,T4,B9
@@ -15,20 +15,18 @@ jobs:
15
15
  runs-on: ubuntu-latest
16
16
  strategy:
17
17
  matrix:
18
- python-version: [3.8, 3.9, "3.10", "3.11"]
18
+ python-version: ["3.10", "3.11", "3.12"]
19
19
 
20
20
  steps:
21
21
  - uses: actions/checkout@v2
22
22
  - name: Set up Python ${{ matrix.python-version }}
23
- uses: actions/setup-python@v2
23
+ uses: actions/setup-python@v5
24
24
  with:
25
25
  python-version: ${{ matrix.python-version }}
26
26
  - name: Install dependencies
27
27
  run: |
28
28
  python -m pip install --upgrade pip
29
- pip install flake8 pytest
30
- if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
31
- SETUPTOOLS_SCM_PRETEND_VERSION="0.1-dev" pip install --no-dependencies -e .
29
+ SETUPTOOLS_SCM_PRETEND_VERSION="0.1-dev" pip install -e '.[dev]'
32
30
  - name: Lint with flake8
33
31
  run: |
34
32
  # stop the build if there are Python syntax errors or undefined names
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro-text
3
- Version: 2025.4.3
3
+ Version: 2025.5.13
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -18,9 +18,15 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
18
  Requires-Python: >=3.8
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
- Requires-Dist: datamaestro>=1.2.1
21
+ Requires-Dist: datamaestro>=1.4.2
22
22
  Requires-Dist: ir_datasets>=0.5.8
23
23
  Requires-Dist: attrs
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest; extra == "dev"
26
+ Requires-Dist: docutils; extra == "dev"
27
+ Requires-Dist: sphobjinv; extra == "dev"
28
+ Requires-Dist: flake8; extra == "dev"
29
+ Requires-Dist: sphinx; extra == "dev"
24
30
  Dynamic: license-file
25
31
 
26
32
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)
@@ -27,6 +27,16 @@ homepage = "https://github.com/experimaestro/datamaestro_text"
27
27
  documentation = "https://datamaestro-text.readthedocs.io/en/latest/"
28
28
  repository = "https://github.com/experimaestro/datamaestro_text"
29
29
 
30
+
31
+ [project.optional-dependencies]
32
+ dev = [
33
+ "pytest",
34
+ "docutils",
35
+ "sphobjinv",
36
+ "flake8",
37
+ "sphinx"
38
+ ]
39
+
30
40
  [tool.setuptools_scm]
31
41
  write_to = "src/datamaestro_text/version.py"
32
42
  fallback_version = "0.0.0-dev"
@@ -1,3 +1,3 @@
1
- datamaestro>=1.2.1
1
+ datamaestro>=1.4.2
2
2
  ir_datasets>=0.5.8
3
3
  attrs
@@ -1,7 +1,5 @@
1
- # See documentation on https://datamaestro.readthedocs.io
2
-
3
1
  from datamaestro.definitions import datatasks, datatags, dataset
4
- from datamaestro.download.archive import zipdownloader
2
+ from datamaestro.download.single import filedownloader
5
3
  from datamaestro.utils import HashCheck
6
4
 
7
5
  from datamaestro.data.ml import Supervised
@@ -10,14 +8,23 @@ from datamaestro_text.data.conversation.canard import CanardDataset
10
8
 
11
9
  @datatags("conversation", "context", "query")
12
10
  @datatasks("query rewriting")
13
- @zipdownloader(
14
- "archive",
15
- "https://obj.umiacs.umd.edu/elgohary/CANARD_Release.zip",
16
- subpath="CANARD_Release",
17
- checker=HashCheck("c9bba7c6bb898f669383415b54fd6ffd"),
11
+ @filedownloader(
12
+ "train.json",
13
+ "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/train.json",
14
+ checker=HashCheck("73624ac646fb81e09b0fd7f01370ada3"),
15
+ )
16
+ @filedownloader(
17
+ "dev.json",
18
+ "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/dev.json",
19
+ checker=HashCheck("c84525631a83bc771c58ff31f4a9b601"),
20
+ )
21
+ @filedownloader(
22
+ "test.json",
23
+ "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/test.json",
24
+ checker=HashCheck("3fc14d0078e7a5056f5da571728f024e"),
18
25
  )
19
26
  @dataset(Supervised, url="https://sites.google.com/view/qanta/projects/canard", id="")
20
- def main(archive):
27
+ def main(train, dev, test):
21
28
  """Question-in-context rewriting
22
29
 
23
30
  CANARD is a dataset for question-in-context rewriting that consists of
@@ -30,7 +37,7 @@ def main(archive):
30
37
  Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
31
38
  """
32
39
  return {
33
- "train": CanardDataset(path=archive / "train.json"),
34
- "validation": CanardDataset(path=archive / "dev.json"),
35
- "test": CanardDataset(path=archive / "test.json"),
40
+ "train": CanardDataset(path=train),
41
+ "validation": CanardDataset(path=dev),
42
+ "test": CanardDataset(path=test),
36
43
  }
@@ -25,12 +25,11 @@ from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
25
25
  checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
26
26
  )
27
27
  @dataset(
28
- Supervised[QReCCDataset, None, QReCCDataset],
29
28
  url="https://github.com/apple/ml-qrecc",
30
29
  doi="https://doi.org/10.48550/arXiv.2010.04898",
31
30
  id="",
32
31
  )
33
- def main(data: Path):
32
+ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
34
33
  """Open-Domain Question Answering Goes Conversational via Question Rewriting
35
34
 
36
35
  We introduce QReCC (Question Rewriting in Conversational Context), an
@@ -40,10 +39,10 @@ def main(data: Path):
40
39
  answering that includes the individual subtasks of question rewriting,
41
40
  passage retrieval and reading comprehension
42
41
  """
43
- return {
44
- "train": QReCCDataset(path=data / "qrecc_train.json"),
45
- "test": QReCCDataset(path=data / "qrecc_test.json"),
46
- }
42
+ return Supervised(
43
+ train=QReCCDataset(path=data / "qrecc_train.json"),
44
+ test=QReCCDataset(path=data / "qrecc_test.json"),
45
+ )
47
46
 
48
47
 
49
48
  @dataset(
@@ -52,7 +51,6 @@ def main(data: Path):
52
51
  )
53
52
  class Content(LZ4JSONLDocumentStore):
54
53
  """QReCC mentionned URLs content"""
55
-
56
54
  @staticmethod
57
55
  def __create_dataset__(dataset, options=None):
58
56
  ds = reference(reference=main).setup(dataset, options)
@@ -67,7 +65,7 @@ class Content(LZ4JSONLDocumentStore):
67
65
  "id",
68
66
  ).setup(dataset, options)
69
67
 
70
- return LZ4JSONLDocumentStore(jsonl_path=store_path)
68
+ return Content(jsonl_path=store_path)
71
69
 
72
70
  @staticmethod
73
71
  def _documents(path: Path):
@@ -1,11 +1,10 @@
1
1
  from datamaestro.data.csv import Generic
2
- from datamaestro.definitions import argument, datatasks, datatags, dataset
2
+ from datamaestro.definitions import datatasks, datatags, dataset
3
3
  from datamaestro.download.archive import zipdownloader
4
4
  from datamaestro.data.ml import Supervised
5
5
  from datamaestro.utils import HashCheck
6
6
 
7
7
 
8
-
9
8
  @zipdownloader(
10
9
  "dir",
11
10
  "http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip",
@@ -11,6 +11,7 @@ from datamaestro.download.archive import zipdownloader
11
11
  from datamaestro.download.single import filedownloader
12
12
  from datamaestro_text.data.embeddings import WordEmbeddingsText
13
13
 
14
+
14
15
  # size: 822M
15
16
  # statistics:
16
17
  # tokens: 6G
@@ -129,6 +129,8 @@ class ConversationNode:
129
129
 
130
130
 
131
131
  class ConversationTree(ABC):
132
+ """Represents a conversation tree"""
133
+
132
134
  @abstractmethod
133
135
  def root(self) -> ConversationNode:
134
136
  ...
@@ -192,7 +194,7 @@ class SingleConversationTreeNode(ConversationNode):
192
194
  return (
193
195
  SingleConversationTreeNode(self.tree, self.index + 1)
194
196
  if self.index < len(self.tree.history) - 1
195
- else []
197
+ else None
196
198
  )
197
199
 
198
200
  def children(self) -> List[ConversationNode]:
@@ -207,8 +209,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
207
209
  """A conversation tree node"""
208
210
 
209
211
  entry: Record
210
- parent: Optional["ConversationTreeNode"]
211
- children: List["ConversationTreeNode"]
212
+ _parent: Optional["ConversationTreeNode"]
213
+ _children: List["ConversationTreeNode"]
212
214
 
213
215
  def __init__(self, entry):
214
216
  self.entry = entry
@@ -216,8 +218,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
216
218
  self.children = []
217
219
 
218
220
  def add(self, node: "ConversationTreeNode") -> "ConversationTreeNode":
219
- self.children.append(node)
220
- node.parent = self
221
+ self._children.append(node)
222
+ node._parent = self
221
223
  return node
222
224
 
223
225
  def conversation(self, skip_self: bool) -> ConversationHistory:
@@ -232,14 +234,14 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
232
234
  def __iter__(self) -> Iterator["ConversationTreeNode"]:
233
235
  """Iterates over all conversation tree nodes (pre-order)"""
234
236
  yield self.entry
235
- for child in self.children:
237
+ for child in self._children:
236
238
  yield from child
237
239
 
238
240
  def parent(self) -> Optional[ConversationNode]:
239
- return self.parent
241
+ return self._parent
240
242
 
241
243
  def children(self) -> List[ConversationNode]:
242
- return self.children
244
+ return self._children
243
245
 
244
246
  def root(self):
245
247
  return self
@@ -0,0 +1,107 @@
1
+ from typing import Iterator, List
2
+ from attr import define
3
+ import json
4
+ from datamaestro.record import Record
5
+ from datamaestro.data import File
6
+ from datamaestro_text.data.conversation.base import (
7
+ ConversationDataset,
8
+ ConversationTree,
9
+ SingleConversationTree,
10
+ SimpleDecontextualizedItem,
11
+ EntryType,
12
+ )
13
+ from datamaestro_text.data.ir import IDItem, SimpleTextItem
14
+ import logging
15
+
16
+
17
+ @define(kw_only=True)
18
+ class CanardConversation:
19
+ """A query with past history"""
20
+
21
+ history: List[str]
22
+ """The list of queries asked by the user"""
23
+
24
+ query: str
25
+ """The last issued query"""
26
+
27
+ rewrite: str
28
+ """Manually rewritten query"""
29
+
30
+ dialogue_id: str
31
+ """Conversation identifier"""
32
+
33
+ query_no: int
34
+ """Question number"""
35
+
36
+
37
+ class CanardDataset(ConversationDataset, File):
38
+ """A dataset in the CANARD JSON format
39
+
40
+ The CANARD dataset is composed of
41
+ """
42
+
43
+ def entries(self) -> Iterator[CanardConversation]:
44
+ """Iterates over re-written query with their context"""
45
+ with self.path.open("rt") as fp:
46
+ data = json.load(fp)
47
+
48
+ for entry in data:
49
+ yield CanardConversation(
50
+ history=entry["History"],
51
+ query=entry["Question"],
52
+ rewrite=entry["Rewrite"],
53
+ dialogue_id=entry["QuAC_dialog_id"],
54
+ query_no=entry["Question_no"],
55
+ )
56
+
57
+ def __iter__(self) -> Iterator[ConversationTree]:
58
+ history: list[Record] = []
59
+ current_id = None
60
+
61
+ for entry in self.entries():
62
+ # Check if current conversation, otherwise we are OK
63
+ if current_id != entry.dialogue_id:
64
+ if current_id is not None:
65
+ history.reverse()
66
+ yield SingleConversationTree(current_id, history)
67
+ history = []
68
+
69
+ current_id = entry.dialogue_id
70
+
71
+ if not history:
72
+ # First round
73
+ # The two first items are the wikipedia title and section,
74
+ # we interpret them as two user queries
75
+ assert len(entry.history) == 2
76
+ history.extend(
77
+ Record(
78
+ SimpleTextItem(text),
79
+ EntryType.USER_QUERY,
80
+ )
81
+ for text in entry.history
82
+ )
83
+ else:
84
+ # The utterance before the last is the last user query
85
+ assert (
86
+ entry.history[-2] == history[-1][SimpleTextItem].text
87
+ ), f"{entry.dialogue_id} {entry.history} / {history[-4:-1]}"
88
+
89
+ # The last utterance is the system side
90
+ history.append(
91
+ Record(SimpleTextItem(entry.history[-1]), EntryType.SYSTEM_ANSWER)
92
+ )
93
+
94
+ assert len(entry.history) == len(history)
95
+
96
+ # Add to current
97
+ history.append(
98
+ Record(
99
+ IDItem(f"{entry.dialogue_id}-{entry.query_no}"),
100
+ SimpleTextItem(entry.query),
101
+ SimpleDecontextualizedItem(entry.rewrite),
102
+ EntryType.USER_QUERY,
103
+ )
104
+ )
105
+
106
+ if current_id:
107
+ yield SingleConversationTree(current_id, history)
@@ -1,4 +1,3 @@
1
- from functools import cached_property
2
1
  from typing import Iterator, List, Optional
3
2
  from attr import define
4
3
  import json
@@ -43,6 +43,12 @@ class IDItem(Item, ABC):
43
43
 
44
44
  id: str
45
45
 
46
+ @define
47
+ class UrlItem(Item):
48
+ """An url item"""
49
+
50
+ url: str
51
+
46
52
 
47
53
  @define
48
54
  class AdhocAssessment:
@@ -58,7 +58,7 @@ class NFCorpusDocument(TextItem):
58
58
 
59
59
  @cached_property
60
60
  def text(self):
61
- return self.abstract
61
+ return f"{self.title} {self.abstract}"
62
62
 
63
63
 
64
64
  @define
@@ -99,8 +99,8 @@ class WapoDocument(TextItem):
99
99
  body_media: Tuple[WapoDocMedia, ...]
100
100
 
101
101
  @cached_property
102
- def text(self):
103
- return self.body
102
+ def text(self):
103
+ return f"{self.title} {self.body_paras_html}"
104
104
 
105
105
 
106
106
  @define
@@ -126,13 +126,24 @@ class OrConvQADocument(TextItem):
126
126
  def text(self):
127
127
  return f"{self.title} {self.body}"
128
128
 
129
+
129
130
  @define
131
+ class DprW100Doc(TextItem):
132
+ text: str
133
+ title: str
134
+
135
+ @define
136
+ class MsMarcoV2Passage(TextItem):
137
+ text: str
138
+ spans: Tuple[Tuple[int, int], ...]
139
+ msmarco_document_id: str
130
140
  class Touche2020(TextItem):
131
141
  text: str
132
142
  title: str
133
143
  stance: str
134
144
  url: str
135
145
 
146
+
136
147
  @define
137
148
  class SciDocs(TextItem):
138
149
  text: str
@@ -175,7 +186,8 @@ class TrecMb14Query(TextItem):
175
186
  def get_text(self):
176
187
  return f"{self.query}"
177
188
 
178
- @define
189
+
190
+ @define
179
191
  class SciDocsTopic(TextItem):
180
192
  text: str
181
193
  authors: List[str]
@@ -183,6 +195,7 @@ class SciDocsTopic(TextItem):
183
195
  cited_by: List[str]
184
196
  references: List[str]
185
197
 
198
+
186
199
  @define()
187
200
  class TrecTopic(SimpleTextItem):
188
201
  description: str
@@ -190,3 +203,17 @@ class TrecTopic(SimpleTextItem):
190
203
 
191
204
 
192
205
  TrecTopicRecord = record_type(IDItem, TrecTopic)
206
+
207
+ @define
208
+ class DprW100Query(TextItem):
209
+ text: str
210
+ answers: Tuple[str]
211
+
212
+ @define
213
+ class TrecBackgroundLinkingQuery(IDItem):
214
+ query_id: str
215
+ doc_id: str
216
+ url: str
217
+
218
+ def get_text(self):
219
+ raise NotImplementedError()
@@ -37,6 +37,7 @@ from datamaestro_text.data.ir.base import (
37
37
  SimpleAdhocAssessment,
38
38
  SimpleTextItem,
39
39
  TopicRecord,
40
+ UrlItem,
40
41
  create_record,
41
42
  )
42
43
 
@@ -165,6 +166,19 @@ class Documents(ir.DocumentStore, IRDSId):
165
166
  "source",
166
167
  "source_content_type",
167
168
  ),
169
+ _irds.dpr_w100.DprW100Doc: tuple_constructor(
170
+ formats.DprW100Doc,
171
+ "doc_id",
172
+ "text",
173
+ "title",
174
+ ),
175
+ _irds.msmarco_passage_v2.MsMarcoV2Passage: tuple_constructor(
176
+ formats.MsMarcoV2Passage,
177
+ "doc_id",
178
+ "text",
179
+ "spans",
180
+ "msmarco_document_id",
181
+ ),
168
182
  }
169
183
 
170
184
  """Wraps an ir datasets collection -- and provide a default text
@@ -385,6 +399,12 @@ class Topics(ir.TopicsStore, IRDSId):
385
399
  "tweet_time",
386
400
  "description",
387
401
  ),
402
+ _irds.dpr_w100.DprW100Query: tuple_constructor(
403
+ formats.DprW100Query,
404
+ "query_id",
405
+ "text",
406
+ "answers"
407
+ ),
388
408
  }
389
409
 
390
410
  HANDLERS = {
@@ -415,7 +435,52 @@ class Topics(ir.TopicsStore, IRDSId):
415
435
  def iter(self) -> Iterator[TopicRecord]:
416
436
  """Returns an iterator over topics"""
417
437
  return self.handler.iter()
438
+
439
+ class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
440
+ def __init__(self, dataset):
441
+ self.dataset = dataset
442
+
443
+ @cached_property
444
+ def ext2records(self):
445
+ return {record[IDItem].id: record for record in self.records}
446
+
447
+ def topic_int(self, internal_topic_id: int) -> TopicRecord:
448
+ """Returns a document given its internal ID"""
449
+ return self.records[internal_topic_id]
450
+
451
+ def topic_ext(self, external_topic_id: str) -> TopicRecord:
452
+ """Returns a document given its external ID"""
453
+ return self.ext2records[external_topic_id]
418
454
 
455
+ def iter(self) -> Iterator[ir.TopicRecord]:
456
+ """Returns an iterator over topics"""
457
+ return iter(self.records)
458
+
459
+ @cached_property
460
+ def records(self):
461
+ try:
462
+ records = []
463
+
464
+ for query in self.dataset.dataset.queries_iter():
465
+ topic = Record(
466
+ IDItem(query.query_id),
467
+ # Following BEIR documentation, we use title of documents as queries: https://github.com/beir-cellar/beir/blob/main/examples/dataset/README.md#queries-and-qrels
468
+ SimpleTextItem(self.dataset.dataset.docs_store().get(query.doc_id).title),
469
+ UrlItem(query.url),
470
+ )
471
+ records.append(topic)
472
+ except Exception:
473
+ logging.exception("Error while computing topic records")
474
+ raise
475
+
476
+ return records
477
+
478
+
479
+ Topics.HANDLERS.update(
480
+ {
481
+ _irds.wapo.TrecBackgroundLinkingQuery: TrecBackgroundLinkingTopicsHandler
482
+ }
483
+ )
419
484
 
420
485
  class CastTopicsHandler(TopicsHandler):
421
486
  def __init__(self, dataset):
@@ -116,10 +116,6 @@ class AdhocRunDataset(Dataset):
116
116
  def _prepare(self, download=False) -> AdhocRun:
117
117
  return AdhocRun(id=self.fullid)
118
118
 
119
- @property
120
- def configtype(self):
121
- return AdhocRun
122
-
123
119
 
124
120
  class Collection(Dataset):
125
121
  base = Adhoc
@@ -1,4 +1,4 @@
1
- from typing import Callable, TypeVar, Iterator, List, Union
1
+ from typing import Callable, Sequence, TypeVar, Iterator, List, Union
2
2
 
3
3
  T = TypeVar("T")
4
4
 
@@ -45,7 +45,7 @@ class RangeView:
45
45
  return RangeView(self.source, key)
46
46
 
47
47
 
48
- class LazyList:
48
+ class LazyList(Sequence):
49
49
  """Iterable-based list
50
50
 
51
51
  The list is only materialized if needed"""
@@ -63,6 +63,9 @@ class LazyList:
63
63
  else:
64
64
  return iter(self.materialized_list)
65
65
 
66
+ def __len__(self):
67
+ return len(self.iterable)
68
+
66
69
  def __getitem__(self, index):
67
70
  # Materialize the list if accessing an index above the threshold or any slice
68
71
  if isinstance(index, slice) or index >= self.materialize_threshold:
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '2025.4.3'
21
- __version_tuple__ = version_tuple = (2025, 4, 3)
20
+ __version__ = version = '2025.5.13'
21
+ __version_tuple__ = version_tuple = (2025, 5, 13)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro-text
3
- Version: 2025.4.3
3
+ Version: 2025.5.13
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -18,9 +18,15 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
18
  Requires-Python: >=3.8
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
- Requires-Dist: datamaestro>=1.2.1
21
+ Requires-Dist: datamaestro>=1.4.2
22
22
  Requires-Dist: ir_datasets>=0.5.8
23
23
  Requires-Dist: attrs
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest; extra == "dev"
26
+ Requires-Dist: docutils; extra == "dev"
27
+ Requires-Dist: sphobjinv; extra == "dev"
28
+ Requires-Dist: flake8; extra == "dev"
29
+ Requires-Dist: sphinx; extra == "dev"
24
30
  Dynamic: license-file
25
31
 
26
32
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)
@@ -0,0 +1,10 @@
1
+ datamaestro>=1.4.2
2
+ ir_datasets>=0.5.8
3
+ attrs
4
+
5
+ [dev]
6
+ pytest
7
+ docutils
8
+ sphobjinv
9
+ flake8
10
+ sphinx
@@ -1,68 +0,0 @@
1
- from typing import Iterator, List
2
- from attr import define
3
- import json
4
- from datamaestro.data import File
5
- from .base import (
6
- ConversationTree,
7
- SingleConversationTree,
8
- )
9
- from . import ConversationDataset
10
-
11
-
12
- @define(kw_only=True)
13
- class CanardConversation:
14
- """A query with past history"""
15
-
16
- history: List[str]
17
- """The list of queries asked by the user"""
18
-
19
- query: str
20
- """The last issued query"""
21
-
22
- rewrite: str
23
- """Manually rewritten query"""
24
-
25
- dialogue_id: str
26
- """Conversation identifier"""
27
-
28
- query_no: int
29
- """Question number"""
30
-
31
-
32
- class CanardDataset(ConversationDataset, File):
33
- """A dataset in the CANARD JSON format"""
34
-
35
- def entries(self) -> Iterator[CanardConversation]:
36
- """Iterates over re-written query with their context"""
37
- with self.path.open("rt") as fp:
38
- data = json.load(fp)
39
-
40
- for entry in data:
41
- yield CanardConversation(
42
- history=entry["History"],
43
- query=entry["Question"],
44
- rewrite=entry["Rewrite"],
45
- dialogue_id=entry["QuAC_dialog_id"],
46
- query_no=entry["Question_no"],
47
- )
48
-
49
- def __iter__(self) -> Iterator[ConversationTree]:
50
- history = []
51
- current_id = None
52
-
53
- for entry in self.entries():
54
- # Check if current conversation
55
- if current_id != entry.dialogue_id and current_id is not None:
56
- history.reverse()
57
- yield SingleConversationTree(current_id, history)
58
-
59
- # Add to current
60
- history.append(
61
- # FIXME: not working anymore
62
- CanardEntry(
63
- query=entry.query,
64
- decontextualized_query=entry.rewrite,
65
- )
66
- )
67
-
68
- yield current
@@ -1,3 +0,0 @@
1
- datamaestro>=1.2.1
2
- ir_datasets>=0.5.8
3
- attrs