datamaestro-text 2025.1.7__tar.gz → 2025.5.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/.flake8 +1 -1
  2. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/.github/workflows/pytest.yml +3 -5
  3. {datamaestro_text-2025.1.7/src/datamaestro_text.egg-info → datamaestro_text-2025.5.13}/PKG-INFO +10 -3
  4. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/pyproject.toml +10 -0
  5. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/requirements.txt +1 -1
  6. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/github/aagohary/canard.py +19 -12
  7. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +6 -8
  8. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +8 -7
  9. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/sentiment140.py +1 -6
  10. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/stanford/glove.py +1 -0
  11. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +3 -4
  12. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/trec/tipster.py +1 -1
  13. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/conversation/base.py +10 -8
  14. datamaestro_text-2025.5.13/src/datamaestro_text/data/conversation/canard.py +107 -0
  15. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/conversation/orconvqa.py +0 -1
  16. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/embeddings.py +3 -3
  17. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/base.py +6 -0
  18. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/cord19.py +2 -1
  19. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/formats.py +31 -4
  20. datamaestro_text-2025.5.13/src/datamaestro_text/data/recommendation.py +13 -0
  21. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/text.py +6 -6
  22. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/datasets/irds/data.py +65 -0
  23. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/datasets/irds/datasets.py +0 -4
  24. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/utils/iter.py +5 -2
  25. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/version.py +9 -4
  26. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13/src/datamaestro_text.egg-info}/PKG-INFO +10 -3
  27. datamaestro_text-2025.5.13/src/datamaestro_text.egg-info/requires.txt +10 -0
  28. datamaestro_text-2025.1.7/src/datamaestro_text/data/conversation/canard.py +0 -68
  29. datamaestro_text-2025.1.7/src/datamaestro_text/data/recommendation.py +0 -14
  30. datamaestro_text-2025.1.7/src/datamaestro_text.egg-info/requires.txt +0 -3
  31. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/.circleci/config.yml +0 -0
  32. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/.github/workflows/python-publish.yml +0 -0
  33. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/.gitignore +0 -0
  34. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/.pre-commit-config.yaml +0 -0
  35. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/.readthedocs.yml +0 -0
  36. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/LICENSE +0 -0
  37. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/MANIFEST.in +0 -0
  38. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/Makefile +0 -0
  39. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/README.md +0 -0
  40. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/Makefile +0 -0
  41. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/make.bat +0 -0
  42. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/requirements.txt +0 -0
  43. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/source/api/conversation.rst +0 -0
  44. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/source/api/embeddings.rst +0 -0
  45. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/source/api/index.rst +0 -0
  46. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/source/api/ir.rst +0 -0
  47. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/source/api/nlp.rst +0 -0
  48. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/source/api/recommendation.rst +0 -0
  49. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/source/api/text.rst +0 -0
  50. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/source/conf.py +0 -0
  51. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/source/datasets/conversation.rst +0 -0
  52. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/source/datasets/embeddings.rst +0 -0
  53. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/source/datasets/index.rst +0 -0
  54. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/source/datasets/ir.rst +0 -0
  55. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/source/datasets/irds.rst +0 -0
  56. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/source/datasets/recommendation.rst +0 -0
  57. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/source/datasets/text.rst +0 -0
  58. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/docs/source/index.rst +0 -0
  59. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/mkdocs.yml +0 -0
  60. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/requirements-dev.txt +0 -0
  61. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/setup.cfg +0 -0
  62. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/__init__.py +0 -0
  63. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/__init__.py +0 -0
  64. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/ai/quac.yaml +0 -0
  65. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
  66. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -0
  67. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
  68. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
  69. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
  70. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
  71. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
  72. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
  73. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
  74. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
  75. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
  76. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
  77. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/__init__.py +0 -0
  78. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
  79. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
  80. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
  81. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
  82. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
  83. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
  84. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
  85. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
  86. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
  87. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
  88. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
  89. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
  90. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
  91. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
  92. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
  93. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
  94. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
  95. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
  96. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/__init__.py +0 -0
  97. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/conversation/__init__.py +0 -0
  98. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/conversation/qrecc.py +0 -0
  99. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/__init__.py +0 -0
  100. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/csv.py +0 -0
  101. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/data.py +0 -0
  102. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/huggingface.py +0 -0
  103. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/stores.py +0 -0
  104. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/trec.py +0 -0
  105. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/utils.py +0 -0
  106. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/tagging.py +0 -0
  107. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
  108. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/datasets/irds/helpers.py +0 -0
  109. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/datasets/irds/utils.py +0 -0
  110. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/download/tmdb.py +0 -0
  111. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/interfaces/plaintext.py +0 -0
  112. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/interfaces/trec.py +0 -0
  113. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/test/__init__.py +0 -0
  114. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/test/test_datasets.py +0 -0
  115. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/test/test_documented.py +0 -0
  116. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/transforms/__init__.py +0 -0
  117. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
  118. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/utils/__init__.py +0 -0
  119. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/utils/files.py +0 -0
  120. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/utils/randomstream.py +0 -0
  121. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/utils/shuffle.py +0 -0
  122. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text.egg-info/SOURCES.txt +0 -0
  123. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
  124. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
  125. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text.egg-info/top_level.txt +0 -0
  126. {datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/tox.ini +0 -0
@@ -1,5 +1,5 @@
1
1
  [flake8]
2
- ignore = E203, E266, E501, W503, F403, F401
2
+ ignore = E203, E266, E501, W503, F403, F401, E704
3
3
  max-line-length = 79
4
4
  max-complexity = 18
5
5
  select = B,C,E,F,W,T4,B9
@@ -15,20 +15,18 @@ jobs:
15
15
  runs-on: ubuntu-latest
16
16
  strategy:
17
17
  matrix:
18
- python-version: [3.8, 3.9, "3.10", "3.11"]
18
+ python-version: ["3.10", "3.11", "3.12"]
19
19
 
20
20
  steps:
21
21
  - uses: actions/checkout@v2
22
22
  - name: Set up Python ${{ matrix.python-version }}
23
- uses: actions/setup-python@v2
23
+ uses: actions/setup-python@v5
24
24
  with:
25
25
  python-version: ${{ matrix.python-version }}
26
26
  - name: Install dependencies
27
27
  run: |
28
28
  python -m pip install --upgrade pip
29
- pip install flake8 pytest
30
- if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
31
- SETUPTOOLS_SCM_PRETEND_VERSION="0.1-dev" pip install --no-dependencies -e .
29
+ SETUPTOOLS_SCM_PRETEND_VERSION="0.1-dev" pip install -e '.[dev]'
32
30
  - name: Lint with flake8
33
31
  run: |
34
32
  # stop the build if there are Python syntax errors or undefined names
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: datamaestro-text
3
- Version: 2025.1.7
3
+ Version: 2025.5.13
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -18,9 +18,16 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
18
  Requires-Python: >=3.8
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
- Requires-Dist: datamaestro>=1.2.1
21
+ Requires-Dist: datamaestro>=1.4.2
22
22
  Requires-Dist: ir_datasets>=0.5.8
23
23
  Requires-Dist: attrs
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest; extra == "dev"
26
+ Requires-Dist: docutils; extra == "dev"
27
+ Requires-Dist: sphobjinv; extra == "dev"
28
+ Requires-Dist: flake8; extra == "dev"
29
+ Requires-Dist: sphinx; extra == "dev"
30
+ Dynamic: license-file
24
31
 
25
32
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)
26
33
 
@@ -27,6 +27,16 @@ homepage = "https://github.com/experimaestro/datamaestro_text"
27
27
  documentation = "https://datamaestro-text.readthedocs.io/en/latest/"
28
28
  repository = "https://github.com/experimaestro/datamaestro_text"
29
29
 
30
+
31
+ [project.optional-dependencies]
32
+ dev = [
33
+ "pytest",
34
+ "docutils",
35
+ "sphobjinv",
36
+ "flake8",
37
+ "sphinx"
38
+ ]
39
+
30
40
  [tool.setuptools_scm]
31
41
  write_to = "src/datamaestro_text/version.py"
32
42
  fallback_version = "0.0.0-dev"
@@ -1,3 +1,3 @@
1
- datamaestro>=1.2.1
1
+ datamaestro>=1.4.2
2
2
  ir_datasets>=0.5.8
3
3
  attrs
@@ -1,7 +1,5 @@
1
- # See documentation on https://datamaestro.readthedocs.io
2
-
3
1
  from datamaestro.definitions import datatasks, datatags, dataset
4
- from datamaestro.download.archive import zipdownloader
2
+ from datamaestro.download.single import filedownloader
5
3
  from datamaestro.utils import HashCheck
6
4
 
7
5
  from datamaestro.data.ml import Supervised
@@ -10,14 +8,23 @@ from datamaestro_text.data.conversation.canard import CanardDataset
10
8
 
11
9
  @datatags("conversation", "context", "query")
12
10
  @datatasks("query rewriting")
13
- @zipdownloader(
14
- "archive",
15
- "https://obj.umiacs.umd.edu/elgohary/CANARD_Release.zip",
16
- subpath="CANARD_Release",
17
- checker=HashCheck("c9bba7c6bb898f669383415b54fd6ffd"),
11
+ @filedownloader(
12
+ "train.json",
13
+ "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/train.json",
14
+ checker=HashCheck("73624ac646fb81e09b0fd7f01370ada3"),
15
+ )
16
+ @filedownloader(
17
+ "dev.json",
18
+ "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/dev.json",
19
+ checker=HashCheck("c84525631a83bc771c58ff31f4a9b601"),
20
+ )
21
+ @filedownloader(
22
+ "test.json",
23
+ "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/test.json",
24
+ checker=HashCheck("3fc14d0078e7a5056f5da571728f024e"),
18
25
  )
19
26
  @dataset(Supervised, url="https://sites.google.com/view/qanta/projects/canard", id="")
20
- def main(archive):
27
+ def main(train, dev, test):
21
28
  """Question-in-context rewriting
22
29
 
23
30
  CANARD is a dataset for question-in-context rewriting that consists of
@@ -30,7 +37,7 @@ def main(archive):
30
37
  Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
31
38
  """
32
39
  return {
33
- "train": CanardDataset(path=archive / "train.json"),
34
- "validation": CanardDataset(path=archive / "dev.json"),
35
- "test": CanardDataset(path=archive / "test.json"),
40
+ "train": CanardDataset(path=train),
41
+ "validation": CanardDataset(path=dev),
42
+ "test": CanardDataset(path=test),
36
43
  }
@@ -25,12 +25,11 @@ from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
25
25
  checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
26
26
  )
27
27
  @dataset(
28
- Supervised[QReCCDataset, None, QReCCDataset],
29
28
  url="https://github.com/apple/ml-qrecc",
30
29
  doi="https://doi.org/10.48550/arXiv.2010.04898",
31
30
  id="",
32
31
  )
33
- def main(data: Path):
32
+ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
34
33
  """Open-Domain Question Answering Goes Conversational via Question Rewriting
35
34
 
36
35
  We introduce QReCC (Question Rewriting in Conversational Context), an
@@ -40,10 +39,10 @@ def main(data: Path):
40
39
  answering that includes the individual subtasks of question rewriting,
41
40
  passage retrieval and reading comprehension
42
41
  """
43
- return {
44
- "train": QReCCDataset(path=data / "qrecc_train.json"),
45
- "test": QReCCDataset(path=data / "qrecc_test.json"),
46
- }
42
+ return Supervised(
43
+ train=QReCCDataset(path=data / "qrecc_train.json"),
44
+ test=QReCCDataset(path=data / "qrecc_test.json"),
45
+ )
47
46
 
48
47
 
49
48
  @dataset(
@@ -52,7 +51,6 @@ def main(data: Path):
52
51
  )
53
52
  class Content(LZ4JSONLDocumentStore):
54
53
  """QReCC mentionned URLs content"""
55
-
56
54
  @staticmethod
57
55
  def __create_dataset__(dataset, options=None):
58
56
  ds = reference(reference=main).setup(dataset, options)
@@ -67,7 +65,7 @@ class Content(LZ4JSONLDocumentStore):
67
65
  "id",
68
66
  ).setup(dataset, options)
69
67
 
70
- return LZ4JSONLDocumentStore(jsonl_path=store_path)
68
+ return Content(jsonl_path=store_path)
71
69
 
72
70
  @staticmethod
73
71
  def _documents(path: Path):
@@ -1,11 +1,11 @@
1
1
  """MS MARCO (Microsoft Machine Reading Comprehension) is a large scale dataset focused on machine reading comprehension, question answering, and passage ranking. A variant of this task will be the part of TREC and AFIRM 2019. For Updates about TREC 2019 please follow This Repository Passage Reranking task Task Given a query q and a the 1000 most relevant passages P = p1, p2, p3,... p1000, as retrieved by BM25 a succeful system is expected to rerank the most relevant passage as high as possible. For this task not all 1000 relevant items have a human labeled relevant passage. Evaluation will be done using MRR.
2
2
 
3
- **Publication**:
4
- Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
5
- MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
3
+ **Publication**:
4
+ Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
5
+ MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
6
6
 
7
7
 
8
- See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
8
+ See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
9
9
  """
10
10
 
11
11
  from datamaestro.annotations.agreement import useragreement
@@ -35,6 +35,7 @@ http://www.msmarco.org/dataset.aspx""",
35
35
 
36
36
  # --- Document collection
37
37
 
38
+
38
39
  # TODO: Not ideal since it would be better to have small versions right away
39
40
  # instead of downloading again the MS Marco Collection
40
41
  @lua
@@ -43,10 +44,10 @@ http://www.msmarco.org/dataset.aspx""",
43
44
  url="https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz",
44
45
  checker=HashCheck("31644046b18952c1386cd4564ba2ae69", md5),
45
46
  )
46
- @dataset(Folder, url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
47
- def collection_etc(data):
47
+ @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
48
+ def collection_etc(data) -> Folder:
48
49
  """Documents and some more files"""
49
- return {"path": data}
50
+ return Folder(path=data)
50
51
 
51
52
 
52
53
  @lua
@@ -1,14 +1,9 @@
1
1
  from datamaestro.data.csv import Generic
2
- from datamaestro.definitions import argument, datatasks, datatags, dataset
2
+ from datamaestro.definitions import datatasks, datatags, dataset
3
3
  from datamaestro.download.archive import zipdownloader
4
4
  from datamaestro.data.ml import Supervised
5
5
  from datamaestro.utils import HashCheck
6
6
 
7
- # name: Sentiment140
8
- # web: http://help.sentiment140.com/for-students/
9
-
10
- # description: |
11
-
12
7
 
13
8
  @zipdownloader(
14
9
  "dir",
@@ -11,6 +11,7 @@ from datamaestro.download.archive import zipdownloader
11
11
  from datamaestro.download.single import filedownloader
12
12
  from datamaestro_text.data.embeddings import WordEmbeddingsText
13
13
 
14
+
14
15
  # size: 822M
15
16
  # statistics:
16
17
  # tokens: 6G
@@ -1,10 +1,9 @@
1
1
  """The AQUAINT Corpus, Linguistic Data Consortium (LDC) catalog number LDC2002T31 and ISBN 1-58563-240-6 consists of newswire text data in English, drawn from three sources: the Xinhua News Service (People's Republic of China), the New York Times News Service, and the Associated Press Worldstream News Service. It was prepared by the LDC for the AQUAINT Project, and will be used in official benchmark evaluations conducted by National Institute of Standards and Technology (NIST)."""
2
2
 
3
- from datamaestro.definitions import DatafolderPath
4
- from datamaestro.data import Base
5
- from datamaestro_text.data.ir.trec import TipsterCollection
6
- from datamaestro.definitions import argument, datatasks, datatags, dataset
3
+ from datamaestro.context import DatafolderPath
4
+ from datamaestro.definitions import dataset
7
5
  from datamaestro.download.links import links, linkfolder
6
+ from datamaestro_text.data.ir.trec import TipsterCollection
8
7
 
9
8
 
10
9
  URL = "https://catalog.ldc.upenn.edu/LDC2002T31"
@@ -22,8 +22,8 @@ from datamaestro_text.data.ir.trec import TipsterCollection
22
22
  from datamaestro.download.links import linkfolder
23
23
  from datamaestro.definitions import (
24
24
  dataset,
25
- DatafolderPath,
26
25
  )
26
+ from datamaestro.context import DatafolderPath
27
27
 
28
28
  # Store meta-information
29
29
  TIPSTER = dataset(TipsterCollection, url="https://catalog.ldc.upenn.edu/LDC93T3A")
@@ -129,6 +129,8 @@ class ConversationNode:
129
129
 
130
130
 
131
131
  class ConversationTree(ABC):
132
+ """Represents a conversation tree"""
133
+
132
134
  @abstractmethod
133
135
  def root(self) -> ConversationNode:
134
136
  ...
@@ -192,7 +194,7 @@ class SingleConversationTreeNode(ConversationNode):
192
194
  return (
193
195
  SingleConversationTreeNode(self.tree, self.index + 1)
194
196
  if self.index < len(self.tree.history) - 1
195
- else []
197
+ else None
196
198
  )
197
199
 
198
200
  def children(self) -> List[ConversationNode]:
@@ -207,8 +209,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
207
209
  """A conversation tree node"""
208
210
 
209
211
  entry: Record
210
- parent: Optional["ConversationTreeNode"]
211
- children: List["ConversationTreeNode"]
212
+ _parent: Optional["ConversationTreeNode"]
213
+ _children: List["ConversationTreeNode"]
212
214
 
213
215
  def __init__(self, entry):
214
216
  self.entry = entry
@@ -216,8 +218,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
216
218
  self.children = []
217
219
 
218
220
  def add(self, node: "ConversationTreeNode") -> "ConversationTreeNode":
219
- self.children.append(node)
220
- node.parent = self
221
+ self._children.append(node)
222
+ node._parent = self
221
223
  return node
222
224
 
223
225
  def conversation(self, skip_self: bool) -> ConversationHistory:
@@ -232,14 +234,14 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
232
234
  def __iter__(self) -> Iterator["ConversationTreeNode"]:
233
235
  """Iterates over all conversation tree nodes (pre-order)"""
234
236
  yield self.entry
235
- for child in self.children:
237
+ for child in self._children:
236
238
  yield from child
237
239
 
238
240
  def parent(self) -> Optional[ConversationNode]:
239
- return self.parent
241
+ return self._parent
240
242
 
241
243
  def children(self) -> List[ConversationNode]:
242
- return self.children
244
+ return self._children
243
245
 
244
246
  def root(self):
245
247
  return self
@@ -0,0 +1,107 @@
1
+ from typing import Iterator, List
2
+ from attr import define
3
+ import json
4
+ from datamaestro.record import Record
5
+ from datamaestro.data import File
6
+ from datamaestro_text.data.conversation.base import (
7
+ ConversationDataset,
8
+ ConversationTree,
9
+ SingleConversationTree,
10
+ SimpleDecontextualizedItem,
11
+ EntryType,
12
+ )
13
+ from datamaestro_text.data.ir import IDItem, SimpleTextItem
14
+ import logging
15
+
16
+
17
+ @define(kw_only=True)
18
+ class CanardConversation:
19
+ """A query with past history"""
20
+
21
+ history: List[str]
22
+ """The list of queries asked by the user"""
23
+
24
+ query: str
25
+ """The last issued query"""
26
+
27
+ rewrite: str
28
+ """Manually rewritten query"""
29
+
30
+ dialogue_id: str
31
+ """Conversation identifier"""
32
+
33
+ query_no: int
34
+ """Question number"""
35
+
36
+
37
+ class CanardDataset(ConversationDataset, File):
38
+ """A dataset in the CANARD JSON format
39
+
40
+ The CANARD dataset is composed of
41
+ """
42
+
43
+ def entries(self) -> Iterator[CanardConversation]:
44
+ """Iterates over re-written query with their context"""
45
+ with self.path.open("rt") as fp:
46
+ data = json.load(fp)
47
+
48
+ for entry in data:
49
+ yield CanardConversation(
50
+ history=entry["History"],
51
+ query=entry["Question"],
52
+ rewrite=entry["Rewrite"],
53
+ dialogue_id=entry["QuAC_dialog_id"],
54
+ query_no=entry["Question_no"],
55
+ )
56
+
57
+ def __iter__(self) -> Iterator[ConversationTree]:
58
+ history: list[Record] = []
59
+ current_id = None
60
+
61
+ for entry in self.entries():
62
+ # Check if current conversation, otherwise we are OK
63
+ if current_id != entry.dialogue_id:
64
+ if current_id is not None:
65
+ history.reverse()
66
+ yield SingleConversationTree(current_id, history)
67
+ history = []
68
+
69
+ current_id = entry.dialogue_id
70
+
71
+ if not history:
72
+ # First round
73
+ # The two first items are the wikipedia title and section,
74
+ # we interpret them as two user queries
75
+ assert len(entry.history) == 2
76
+ history.extend(
77
+ Record(
78
+ SimpleTextItem(text),
79
+ EntryType.USER_QUERY,
80
+ )
81
+ for text in entry.history
82
+ )
83
+ else:
84
+ # The utterance before the last is the last user query
85
+ assert (
86
+ entry.history[-2] == history[-1][SimpleTextItem].text
87
+ ), f"{entry.dialogue_id} {entry.history} / {history[-4:-1]}"
88
+
89
+ # The last utterance is the system side
90
+ history.append(
91
+ Record(SimpleTextItem(entry.history[-1]), EntryType.SYSTEM_ANSWER)
92
+ )
93
+
94
+ assert len(entry.history) == len(history)
95
+
96
+ # Add to current
97
+ history.append(
98
+ Record(
99
+ IDItem(f"{entry.dialogue_id}-{entry.query_no}"),
100
+ SimpleTextItem(entry.query),
101
+ SimpleDecontextualizedItem(entry.rewrite),
102
+ EntryType.USER_QUERY,
103
+ )
104
+ )
105
+
106
+ if current_id:
107
+ yield SingleConversationTree(current_id, history)
@@ -1,4 +1,3 @@
1
- from functools import cached_property
2
1
  from typing import Iterator, List, Optional
3
2
  from attr import define
4
3
  import json
@@ -1,5 +1,5 @@
1
- from pathlib import Path
2
- from datamaestro.data import Base, File, argument
1
+ from experimaestro import Meta
2
+ from datamaestro.data import Base, File
3
3
  from datamaestro.definitions import datatags
4
4
  import numpy as np
5
5
  from typing import Tuple, List
@@ -18,9 +18,9 @@ class WordEmbeddings(Base):
18
18
  raise NotImplementedError()
19
19
 
20
20
 
21
- @argument("encoding", str, ignored=True, default="utf-8")
22
21
  class WordEmbeddingsText(WordEmbeddings, File):
23
22
  """Word embeddings as a text word / values"""
23
+ encoding: Meta[str] = "utf-8"
24
24
 
25
25
  def load(self):
26
26
  words = []
@@ -43,6 +43,12 @@ class IDItem(Item, ABC):
43
43
 
44
44
  id: str
45
45
 
46
+ @define
47
+ class UrlItem(Item):
48
+ """An url item"""
49
+
50
+ url: str
51
+
46
52
 
47
53
  @define
48
54
  class AdhocAssessment:
@@ -1,7 +1,8 @@
1
1
  from csv import DictReader
2
2
  from typing import Iterator
3
3
 
4
- from datamaestro.data import File, documentation
4
+ from experimaestro import documentation
5
+ from datamaestro.data import File
5
6
  from datamaestro.record import Record
6
7
  from datamaestro_text.data.ir import Documents, TopicRecord, Topics, IDItem
7
8
  from datamaestro_text.data.ir.formats import (
@@ -58,7 +58,7 @@ class NFCorpusDocument(TextItem):
58
58
 
59
59
  @cached_property
60
60
  def text(self):
61
- return self.abstract
61
+ return f"{self.title} {self.abstract}"
62
62
 
63
63
 
64
64
  @define
@@ -99,8 +99,8 @@ class WapoDocument(TextItem):
99
99
  body_media: Tuple[WapoDocMedia, ...]
100
100
 
101
101
  @cached_property
102
- def text(self):
103
- return self.body
102
+ def text(self):
103
+ return f"{self.title} {self.body_paras_html}"
104
104
 
105
105
 
106
106
  @define
@@ -126,13 +126,24 @@ class OrConvQADocument(TextItem):
126
126
  def text(self):
127
127
  return f"{self.title} {self.body}"
128
128
 
129
+
129
130
  @define
131
+ class DprW100Doc(TextItem):
132
+ text: str
133
+ title: str
134
+
135
+ @define
136
+ class MsMarcoV2Passage(TextItem):
137
+ text: str
138
+ spans: Tuple[Tuple[int, int], ...]
139
+ msmarco_document_id: str
130
140
  class Touche2020(TextItem):
131
141
  text: str
132
142
  title: str
133
143
  stance: str
134
144
  url: str
135
145
 
146
+
136
147
  @define
137
148
  class SciDocs(TextItem):
138
149
  text: str
@@ -175,7 +186,8 @@ class TrecMb14Query(TextItem):
175
186
  def get_text(self):
176
187
  return f"{self.query}"
177
188
 
178
- @define
189
+
190
+ @define
179
191
  class SciDocsTopic(TextItem):
180
192
  text: str
181
193
  authors: List[str]
@@ -183,6 +195,7 @@ class SciDocsTopic(TextItem):
183
195
  cited_by: List[str]
184
196
  references: List[str]
185
197
 
198
+
186
199
  @define()
187
200
  class TrecTopic(SimpleTextItem):
188
201
  description: str
@@ -190,3 +203,17 @@ class TrecTopic(SimpleTextItem):
190
203
 
191
204
 
192
205
  TrecTopicRecord = record_type(IDItem, TrecTopic)
206
+
207
+ @define
208
+ class DprW100Query(TextItem):
209
+ text: str
210
+ answers: Tuple[str]
211
+
212
+ @define
213
+ class TrecBackgroundLinkingQuery(IDItem):
214
+ query_id: str
215
+ doc_id: str
216
+ url: str
217
+
218
+ def get_text(self):
219
+ raise NotImplementedError()
@@ -0,0 +1,13 @@
1
+ from experimaestro import Param
2
+ from datamaestro.data import Base, File
3
+ import datamaestro.data.csv as csv
4
+
5
+
6
+ class RatedItems(Base):
7
+ ratings: Param[File]
8
+
9
+
10
+ class Movielens(RatedItems):
11
+ links: Param[csv.Generic]
12
+ movies: Param[csv.Generic]
13
+ tags: Param[csv.Generic]
@@ -1,15 +1,15 @@
1
- from pathlib import Path
2
- from datamaestro.data import Base, Folder, File, argument
1
+ from typing import Optional
2
+ from experimaestro import Param
3
+ from datamaestro.data import Base, Folder, File
3
4
  from datamaestro.data.ml import Supervised
4
5
 
5
6
 
6
- @argument("train", type=Base)
7
- @argument("test", type=Base, required=False)
8
- @argument("validation", type=Base, required=False)
9
7
  class TrainingText(Supervised):
10
8
  """ "A dataset used for training with a train and a test"""
11
9
 
12
- pass
10
+ train: Param[Base]
11
+ test: Param[Optional[Base]] = None
12
+ validation: Param[Optional[Base]] = None
13
13
 
14
14
 
15
15
  class TextFolder(Folder):