datamaestro-text 2025.6.11__tar.gz → 2025.7.28__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/.github/workflows/pytest.yml +1 -1
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/.readthedocs.yml +1 -1
- {datamaestro_text-2025.6.11/src/datamaestro_text.egg-info → datamaestro_text-2025.7.28}/PKG-INFO +3 -3
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/api/conversation.rst +24 -2
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/api/ir.rst +8 -2
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/pyproject.toml +9 -1
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/requirements.txt +1 -1
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +3 -3
- datamaestro_text-2025.7.28/src/datamaestro_text/config/com/github/ikat.py +121 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/sentiment140.py +4 -4
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -1
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/conversation/base.py +38 -13
- datamaestro_text-2025.7.28/src/datamaestro_text/data/conversation/ikat.py +145 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/__init__.py +25 -2
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/base.py +2 -1
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/formats.py +8 -2
- datamaestro_text-2025.7.28/src/datamaestro_text/data/ir/stores.py +124 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/trec.py +7 -4
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/datasets/irds/data.py +34 -15
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/interfaces/trec.py +28 -1
- datamaestro_text-2025.7.28/src/datamaestro_text/utils/files.py +111 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/utils/iter.py +5 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/version.py +2 -2
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28/src/datamaestro_text.egg-info}/PKG-INFO +3 -3
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text.egg-info/SOURCES.txt +2 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text.egg-info/requires.txt +1 -1
- datamaestro_text-2025.6.11/src/datamaestro_text/data/ir/stores.py +0 -29
- datamaestro_text-2025.6.11/src/datamaestro_text/utils/files.py +0 -8
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/.circleci/config.yml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/.flake8 +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/.github/workflows/python-publish.yml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/.gitignore +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/.pre-commit-config.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/LICENSE +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/MANIFEST.in +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/Makefile +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/README.md +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/Makefile +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/make.bat +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/requirements.txt +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/api/embeddings.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/api/index.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/api/nlp.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/api/recommendation.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/api/text.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/conf.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/datasets/conversation.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/datasets/embeddings.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/datasets/index.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/datasets/ir.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/datasets/irds.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/datasets/recommendation.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/datasets/text.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/index.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/mkdocs.yml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/requirements-dev.txt +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/setup.cfg +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/ai/quac.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/conversation/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/conversation/canard.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/conversation/orconvqa.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/conversation/qrecc.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/embeddings.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/cord19.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/csv.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/data.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/huggingface.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/utils.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/recommendation.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/tagging.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/text.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/datasets/irds/helpers.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/datasets/irds/utils.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/download/tmdb.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/interfaces/plaintext.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/test/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/test/test_datasets.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/test/test_documented.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/transforms/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/utils/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/utils/randomstream.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/utils/shuffle.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text.egg-info/top_level.txt +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/tox.ini +0 -0
|
@@ -25,7 +25,7 @@ jobs:
|
|
|
25
25
|
python-version: ${{ matrix.python-version }}
|
|
26
26
|
- name: Install dependencies
|
|
27
27
|
run: |
|
|
28
|
-
python -m pip install --upgrade pip
|
|
28
|
+
python -m pip install --upgrade pip setuptools
|
|
29
29
|
SETUPTOOLS_SCM_PRETEND_VERSION="0.1-dev" pip install -e '.[dev]'
|
|
30
30
|
- name: Lint with flake8
|
|
31
31
|
run: |
|
{datamaestro_text-2025.6.11/src/datamaestro_text.egg-info → datamaestro_text-2025.7.28}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datamaestro-text
|
|
3
|
-
Version: 2025.
|
|
3
|
+
Version: 2025.7.28
|
|
4
4
|
Summary: Datamaestro module for text-related datasets
|
|
5
5
|
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
6
6
|
License: GPL-3
|
|
@@ -15,10 +15,10 @@ Classifier: Operating System :: OS Independent
|
|
|
15
15
|
Classifier: Programming Language :: Python
|
|
16
16
|
Classifier: Programming Language :: Python :: 3
|
|
17
17
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
-
Requires-Python: >=3.
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
19
|
Description-Content-Type: text/markdown
|
|
20
20
|
License-File: LICENSE
|
|
21
|
-
Requires-Dist: datamaestro>=1.
|
|
21
|
+
Requires-Dist: datamaestro>=1.5.0
|
|
22
22
|
Requires-Dist: ir_datasets>=0.5.8
|
|
23
23
|
Requires-Dist: attrs
|
|
24
24
|
Provides-Extra: dev
|
|
@@ -26,6 +26,12 @@ Data classes
|
|
|
26
26
|
|
|
27
27
|
.. autoclass:: ConversationTopic
|
|
28
28
|
|
|
29
|
+
Conversational IR
|
|
30
|
+
-----------------
|
|
31
|
+
|
|
32
|
+
.. autoxpmconfig:: datamaestro_text.data.conversation.base.ConversationUserTopics
|
|
33
|
+
|
|
34
|
+
|
|
29
35
|
Contextual query reformulation
|
|
30
36
|
------------------------------
|
|
31
37
|
|
|
@@ -34,9 +40,13 @@ Contextual query reformulation
|
|
|
34
40
|
.. autoclass:: ContextualizedRewrittenQuery
|
|
35
41
|
:members:
|
|
36
42
|
|
|
43
|
+
CANARD Dataset
|
|
44
|
+
|
|
37
45
|
.. autoxpmconfig:: datamaestro_text.data.conversation.canard.CanardDataset
|
|
38
46
|
:members: iter
|
|
39
47
|
|
|
48
|
+
OrConvQA Dataset
|
|
49
|
+
|
|
40
50
|
.. autoxpmconfig:: datamaestro_text.data.conversation.orconvqa.OrConvQADataset
|
|
41
51
|
:members: iter
|
|
42
52
|
|
|
@@ -46,10 +56,22 @@ Contextual query reformulation
|
|
|
46
56
|
.. autoclass:: datamaestro_text.data.conversation.orconvqa.OrConvQADatasetHistoryEntry
|
|
47
57
|
:members:
|
|
48
58
|
|
|
59
|
+
QReCC Dataset
|
|
49
60
|
|
|
50
|
-
|
|
51
|
-
.. autoclass:: datamaestro_text.data.conversation.orconvqa.QReCCDatasetEntry
|
|
61
|
+
.. autoclass:: datamaestro_text.data.conversation.qrecc.QReCCDatasetEntry
|
|
52
62
|
:members:
|
|
53
63
|
|
|
54
64
|
.. autoxpmconfig:: datamaestro_text.data.conversation.qrecc.QReCCDataset
|
|
55
65
|
:members: iter
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
iKAT Dataset
|
|
69
|
+
|
|
70
|
+
.. autoclass:: datamaestro_text.data.conversation.ikat.IkatConversationTopic
|
|
71
|
+
:members:
|
|
72
|
+
|
|
73
|
+
.. autoclass:: datamaestro_text.data.conversation.ikat.IkatConversationEntry
|
|
74
|
+
:members:
|
|
75
|
+
|
|
76
|
+
.. autoxpmconfig:: datamaestro_text.data.conversation.ikat.IkatConversations
|
|
77
|
+
:members: iter
|
|
@@ -36,11 +36,17 @@ Documents
|
|
|
36
36
|
|
|
37
37
|
.. autoxpmconfig:: datamaestro_text.data.ir.Documents
|
|
38
38
|
:members: iter_documents, iter_ids, documentcount
|
|
39
|
+
.. autoxpmconfig:: datamaestro_text.data.ir.csv.Documents
|
|
40
|
+
.. autoxpmconfig:: datamaestro_text.datasets.irds.data.LZ4DocumentStore
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
Dataset-specific documents
|
|
44
|
+
**************************
|
|
45
|
+
|
|
39
46
|
.. autoxpmconfig:: datamaestro_text.data.ir.cord19.Documents
|
|
40
47
|
.. autoxpmconfig:: datamaestro_text.data.ir.trec.TipsterCollection
|
|
41
|
-
.. autoxpmconfig:: datamaestro_text.data.ir.csv.Documents
|
|
42
48
|
.. autoxpmconfig:: datamaestro_text.data.ir.stores.OrConvQADocumentStore
|
|
43
|
-
.. autoxpmconfig:: datamaestro_text.
|
|
49
|
+
.. autoxpmconfig:: datamaestro_text.data.ir.stores.IKatClueWeb22DocumentStore
|
|
44
50
|
|
|
45
51
|
Assessments
|
|
46
52
|
-----------
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "datamaestro-text"
|
|
3
|
-
requires-python = ">=3.
|
|
3
|
+
requires-python = ">=3.10"
|
|
4
4
|
keywords = ["dataset manager", "information retrieval", "experiments"]
|
|
5
5
|
description = "Datamaestro module for text-related datasets"
|
|
6
6
|
dynamic = ["version", "readme", "dependencies"]
|
|
@@ -44,6 +44,14 @@ fallback_version = "0.0.0-dev"
|
|
|
44
44
|
[build-system]
|
|
45
45
|
requires = ["setuptools", "setuptools-scm", "wheel"] # PEP 508 specifications.
|
|
46
46
|
|
|
47
|
+
[dependency-groups]
|
|
48
|
+
dev = [
|
|
49
|
+
"docutils>=0.21.2",
|
|
50
|
+
"pytest>=8.4.1",
|
|
51
|
+
"sphinx>=8.1.3",
|
|
52
|
+
"sphobjinv>=2.3.1.3",
|
|
53
|
+
]
|
|
54
|
+
|
|
47
55
|
[project.entry-points."datamaestro.repositories"]
|
|
48
56
|
text = "datamaestro_text:Repository"
|
|
49
57
|
irds = "datamaestro_text.datasets.irds:Repository"
|
|
@@ -39,9 +39,9 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
|
|
|
39
39
|
answering that includes the individual subtasks of question rewriting,
|
|
40
40
|
passage retrieval and reading comprehension
|
|
41
41
|
"""
|
|
42
|
-
return Supervised(
|
|
43
|
-
train=QReCCDataset(path=data / "qrecc_train.json"),
|
|
44
|
-
test=QReCCDataset(path=data / "qrecc_test.json"),
|
|
42
|
+
return Supervised.C(
|
|
43
|
+
train=QReCCDataset.C(path=data / "qrecc_train.json"),
|
|
44
|
+
test=QReCCDataset.C(path=data / "qrecc_test.json"),
|
|
45
45
|
)
|
|
46
46
|
|
|
47
47
|
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# See documentation on https://datamaestro.readthedocs.io
|
|
2
|
+
|
|
3
|
+
import bz2
|
|
4
|
+
from datamaestro.download import reference
|
|
5
|
+
from datamaestro.definitions import datatasks, datatags, dataset
|
|
6
|
+
from datamaestro_text.data.conversation.base import ConversationUserTopics
|
|
7
|
+
from datamaestro_text.data.ir import Adhoc
|
|
8
|
+
|
|
9
|
+
from datamaestro.utils import HashCheck
|
|
10
|
+
from datamaestro.context import DatafolderPath
|
|
11
|
+
from datamaestro.download.single import filedownloader
|
|
12
|
+
from datamaestro_text.data.conversation.ikat import IkatConversations
|
|
13
|
+
from datamaestro.download.links import linkfolder
|
|
14
|
+
|
|
15
|
+
from datamaestro_text.data.ir.stores import IKatClueWeb22DocumentStore
|
|
16
|
+
from datamaestro_text.data.ir.trec import TrecAdhocAssessments
|
|
17
|
+
from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataset(as_prepare=True)
|
|
21
|
+
def clueweb22(dataset, options=None) -> IKatClueWeb22DocumentStore:
|
|
22
|
+
# Number of documents in the dataset
|
|
23
|
+
count = 116_838_987
|
|
24
|
+
|
|
25
|
+
jsonl_folder = linkfolder(
|
|
26
|
+
"documents", [DatafolderPath("gov.nist.trec.ikat.clueweb22", "jsonl")]
|
|
27
|
+
).setup(dataset, options)
|
|
28
|
+
store_path = lz4docstore_builder(
|
|
29
|
+
"store",
|
|
30
|
+
IKatClueWeb22DocumentStore.generator(
|
|
31
|
+
jsonl_folder,
|
|
32
|
+
jsonl_folder / "ikat_2023_passages_jsonl.sha256sums",
|
|
33
|
+
jsonl_folder / "ikat_2023_passages_hashes.tsv.bz2",
|
|
34
|
+
),
|
|
35
|
+
IKatClueWeb22DocumentStore.Document,
|
|
36
|
+
"id",
|
|
37
|
+
count_hint=count,
|
|
38
|
+
).setup(dataset, options)
|
|
39
|
+
|
|
40
|
+
return IKatClueWeb22DocumentStore.C(path=store_path, count=count)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@datatags("conversation", "context", "query")
|
|
44
|
+
@datatasks("conversational search", "query rewriting")
|
|
45
|
+
@reference("documents", clueweb22)
|
|
46
|
+
@filedownloader(
|
|
47
|
+
"topics.json",
|
|
48
|
+
"https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
|
|
49
|
+
checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
|
|
50
|
+
)
|
|
51
|
+
@dataset(
|
|
52
|
+
id="2025",
|
|
53
|
+
url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
|
|
54
|
+
)
|
|
55
|
+
def test_2025(topics, documents) -> Adhoc.C:
|
|
56
|
+
"""Question-in-context rewriting
|
|
57
|
+
|
|
58
|
+
iKAT is a test dataset for question-in-context rewriting that consists of
|
|
59
|
+
questions each given in a dialog context together with a context-independent
|
|
60
|
+
rewriting of the question.
|
|
61
|
+
"""
|
|
62
|
+
return Adhoc.C(
|
|
63
|
+
topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
|
|
64
|
+
# TODO: add when available
|
|
65
|
+
assessments=TrecAdhocAssessments.C(path="/to/do"),
|
|
66
|
+
documents=documents,
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@datatags("conversation", "context", "query")
|
|
71
|
+
@datatasks("conversational search", "query rewriting")
|
|
72
|
+
@reference("documents", clueweb22)
|
|
73
|
+
@filedownloader(
|
|
74
|
+
"qrels",
|
|
75
|
+
"https://trec.nist.gov/data/ikat/2024-qrels.txt",
|
|
76
|
+
checker=HashCheck("57f958903ed1c12bbac207f62800814f"),
|
|
77
|
+
)
|
|
78
|
+
@filedownloader(
|
|
79
|
+
"topics.json",
|
|
80
|
+
"https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2024/data/2024_test_topics.json",
|
|
81
|
+
checker=HashCheck("ad45bc6e7add2081d69ea60a0a4d1203"),
|
|
82
|
+
)
|
|
83
|
+
@dataset(
|
|
84
|
+
Adhoc,
|
|
85
|
+
id="2024",
|
|
86
|
+
url="https://github.com/irlabamsterdam/iKAT/tree/main/2024",
|
|
87
|
+
)
|
|
88
|
+
def test_2024(topics, qrels, documents) -> Adhoc.C:
|
|
89
|
+
"""iKAT 2024 dataset"""
|
|
90
|
+
return Adhoc.C(
|
|
91
|
+
topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
|
|
92
|
+
assessments=TrecAdhocAssessments.C(path=qrels),
|
|
93
|
+
documents=documents,
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@datatags("conversation", "context", "query")
|
|
98
|
+
@datatasks("conversational search", "query rewriting")
|
|
99
|
+
@reference("documents", clueweb22)
|
|
100
|
+
@filedownloader(
|
|
101
|
+
"qrels",
|
|
102
|
+
"https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
|
|
103
|
+
checker=HashCheck("79dc121bab25b2245e52a53263e5ad1f"),
|
|
104
|
+
)
|
|
105
|
+
@filedownloader(
|
|
106
|
+
"topics.json",
|
|
107
|
+
"https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2023/data/2023_test_topics.json",
|
|
108
|
+
checker=HashCheck("684fa0197cdec8c3cfb6a2e586ab83f6"),
|
|
109
|
+
)
|
|
110
|
+
@dataset(
|
|
111
|
+
Adhoc,
|
|
112
|
+
id="2023",
|
|
113
|
+
url="https://github.com/irlabamsterdam/iKAT/tree/main/2023",
|
|
114
|
+
)
|
|
115
|
+
def test_2023(topics, qrels, documents) -> Adhoc.C:
|
|
116
|
+
"""iKAT 2023 dataset"""
|
|
117
|
+
return Adhoc.C(
|
|
118
|
+
topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
|
|
119
|
+
assessments=TrecAdhocAssessments.C(path=qrels),
|
|
120
|
+
documents=documents,
|
|
121
|
+
)
|
|
@@ -26,7 +26,7 @@ def english(dir):
|
|
|
26
26
|
|
|
27
27
|
If you use this data, please cite Sentiment140 as your source.
|
|
28
28
|
"""
|
|
29
|
-
return
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
29
|
+
return Supervised.C(
|
|
30
|
+
train=Generic(path=dir / "training.1600000.processed.noemoticon.csv"),
|
|
31
|
+
test=Generic(path=dir / "testdata.manual.2009.06.14.csv"),
|
|
32
|
+
)
|
|
@@ -17,7 +17,6 @@ of three CD-ROMs of SGML encoded documents distributed by LDC plus queries and a
|
|
|
17
17
|
See also https://trec.nist.gov/data/docs_eng.html and https://trec.nist.gov/data/intro_eng.html
|
|
18
18
|
"""
|
|
19
19
|
|
|
20
|
-
from datamaestro.data import Base
|
|
21
20
|
from datamaestro_text.data.ir.trec import TipsterCollection
|
|
22
21
|
from datamaestro.download.links import linkfolder
|
|
23
22
|
from datamaestro.definitions import (
|
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
2
|
from enum import Enum
|
|
3
|
+
from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
|
|
4
|
+
from experimaestro import Param
|
|
3
5
|
from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
|
|
4
6
|
from attr import define
|
|
7
|
+
from datamaestro.record import record_type
|
|
5
8
|
from datamaestro.data import Base
|
|
6
9
|
from datamaestro.record import Record, Item
|
|
7
|
-
from datamaestro_text.data.ir import TopicRecord
|
|
10
|
+
from datamaestro_text.data.ir import TopicRecord, Topics
|
|
8
11
|
from datamaestro_text.utils.iter import FactoryIterable, LazyList, RangeView
|
|
9
12
|
|
|
10
13
|
# ---- Basic types
|
|
@@ -120,20 +123,17 @@ class ConversationNode:
|
|
|
120
123
|
...
|
|
121
124
|
|
|
122
125
|
@abstractmethod
|
|
123
|
-
def parent(self) -> Optional["ConversationNode"]:
|
|
124
|
-
...
|
|
126
|
+
def parent(self) -> Optional["ConversationNode"]: ...
|
|
125
127
|
|
|
126
128
|
@abstractmethod
|
|
127
|
-
def children(self) -> List["ConversationNode"]:
|
|
128
|
-
...
|
|
129
|
+
def children(self) -> List["ConversationNode"]: ...
|
|
129
130
|
|
|
130
131
|
|
|
131
132
|
class ConversationTree(ABC):
|
|
132
133
|
"""Represents a conversation tree"""
|
|
133
134
|
|
|
134
135
|
@abstractmethod
|
|
135
|
-
def root(self) -> ConversationNode:
|
|
136
|
-
...
|
|
136
|
+
def root(self) -> ConversationNode: ...
|
|
137
137
|
|
|
138
138
|
@abstractmethod
|
|
139
139
|
def __iter__(self) -> Iterator[ConversationNode]:
|
|
@@ -214,8 +214,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
|
|
|
214
214
|
|
|
215
215
|
def __init__(self, entry):
|
|
216
216
|
self.entry = entry
|
|
217
|
-
self.
|
|
218
|
-
self.
|
|
217
|
+
self._parent = None
|
|
218
|
+
self._children = []
|
|
219
219
|
|
|
220
220
|
def add(self, node: "ConversationTreeNode") -> "ConversationTreeNode":
|
|
221
221
|
self._children.append(node)
|
|
@@ -224,10 +224,10 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
|
|
|
224
224
|
|
|
225
225
|
def conversation(self, skip_self: bool) -> ConversationHistory:
|
|
226
226
|
def iterator():
|
|
227
|
-
current = self.parent if skip_self else self
|
|
227
|
+
current = self.parent() if skip_self else self
|
|
228
228
|
while current is not None:
|
|
229
229
|
yield current.entry
|
|
230
|
-
current = current.parent
|
|
230
|
+
current = current.parent()
|
|
231
231
|
|
|
232
232
|
return LazyList(FactoryIterable(iterator))
|
|
233
233
|
|
|
@@ -253,5 +253,30 @@ class ConversationDataset(Base, ABC):
|
|
|
253
253
|
@abstractmethod
|
|
254
254
|
def __iter__(self) -> Iterator[ConversationTree]:
|
|
255
255
|
"""Return an iterator over conversations"""
|
|
256
|
-
|
|
257
|
-
|
|
256
|
+
...
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
class ConversationUserTopics(Topics):
|
|
260
|
+
"""Extract user topics from conversations"""
|
|
261
|
+
|
|
262
|
+
conversations: Param[ConversationDataset]
|
|
263
|
+
|
|
264
|
+
topic_recordtype = record_type(IDItem, SimpleTextItem)
|
|
265
|
+
|
|
266
|
+
def iter(self) -> Iterator[TopicRecord]:
|
|
267
|
+
"""Returns an iterator over topics"""
|
|
268
|
+
# Extracts topics from conversations, Each user query is a topic (can perform retrieval on it)
|
|
269
|
+
# TODO: merge with xpmir.learning.DatasetConversationBase -> same logic
|
|
270
|
+
|
|
271
|
+
records: List[TopicRecord] = []
|
|
272
|
+
for conversation in self.conversations.__iter__():
|
|
273
|
+
nodes = [
|
|
274
|
+
node
|
|
275
|
+
for node in conversation
|
|
276
|
+
if node.entry[EntryType] == EntryType.USER_QUERY
|
|
277
|
+
]
|
|
278
|
+
for node in nodes:
|
|
279
|
+
records.append(
|
|
280
|
+
node.entry.update(ConversationHistoryItem(node.history()))
|
|
281
|
+
)
|
|
282
|
+
return iter(records)
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
from typing import Iterator, List
|
|
2
|
+
from attr import define, field
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from datamaestro.data import File
|
|
6
|
+
from datamaestro.record import Record
|
|
7
|
+
|
|
8
|
+
from datamaestro_text.data.ir import Topics
|
|
9
|
+
from datamaestro_text.data.ir.base import (
|
|
10
|
+
IDItem,
|
|
11
|
+
SimpleTextItem,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
from .base import (
|
|
16
|
+
AnswerEntry,
|
|
17
|
+
ConversationTree,
|
|
18
|
+
EntryType,
|
|
19
|
+
SimpleDecontextualizedItem,
|
|
20
|
+
SingleConversationTree,
|
|
21
|
+
)
|
|
22
|
+
from . import ConversationDataset
|
|
23
|
+
|
|
24
|
+
# Keys to change in the dataset entries for compatibility across different years
|
|
25
|
+
|
|
26
|
+
KEY_MAPPINGS = {
|
|
27
|
+
# Keys to replace: Target Key
|
|
28
|
+
"turns": "responses",
|
|
29
|
+
"utterance": "user_utterance",
|
|
30
|
+
"ptkb_provenance": "relevant_ptkbs",
|
|
31
|
+
"response_provenance": "citations",
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def norm_dict(entry: dict) -> dict:
|
|
36
|
+
"""Convert keys in the entry to match the expected format."""
|
|
37
|
+
normalized = {}
|
|
38
|
+
for k, v in entry.items():
|
|
39
|
+
# Check for direct mapping, then try lowercase mapping
|
|
40
|
+
new_key = KEY_MAPPINGS.get(k) or KEY_MAPPINGS.get(k.lower()) or k.lower()
|
|
41
|
+
normalized[new_key] = v
|
|
42
|
+
return normalized
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@define(kw_only=True)
|
|
46
|
+
class IkatConversationEntry:
|
|
47
|
+
"""A query with past history"""
|
|
48
|
+
|
|
49
|
+
turn_id: int
|
|
50
|
+
"""Turn number in the conversation"""
|
|
51
|
+
|
|
52
|
+
user_utterance: str
|
|
53
|
+
"""The last issued query"""
|
|
54
|
+
|
|
55
|
+
resolved_utterance: str
|
|
56
|
+
"""Manually rewritten query"""
|
|
57
|
+
|
|
58
|
+
response: str
|
|
59
|
+
"""The system response to the query"""
|
|
60
|
+
|
|
61
|
+
relevant_ptkbs: List[str]
|
|
62
|
+
"""The list of relevant personal knowledge bases for the query"""
|
|
63
|
+
|
|
64
|
+
citations: List[str]
|
|
65
|
+
"""The list of citations for the response"""
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@define(kw_only=True)
|
|
69
|
+
class IkatConversationTopic:
|
|
70
|
+
"""A query with past history"""
|
|
71
|
+
|
|
72
|
+
number: str
|
|
73
|
+
"""Conversation ID"""
|
|
74
|
+
|
|
75
|
+
title: str
|
|
76
|
+
"""Title of the conversation"""
|
|
77
|
+
|
|
78
|
+
ptkb: str
|
|
79
|
+
"""The personal knowledge base associated with the user"""
|
|
80
|
+
|
|
81
|
+
responses: List[IkatConversationEntry] = field(
|
|
82
|
+
converter=lambda items: [
|
|
83
|
+
IkatConversationEntry(**item) if isinstance(item, dict) else item
|
|
84
|
+
for item in map(norm_dict, items)
|
|
85
|
+
]
|
|
86
|
+
)
|
|
87
|
+
"""The list of responses to the query"""
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class IkatConversations(ConversationDataset, File):
    """A dataset containing conversations from the IKAT project

    The keys of the raw entries are normalized with :func:`norm_dict` so
    that files from different years can be read the same way.
    """

    def entries(self) -> Iterator[IkatConversationTopic]:
        """Reads all conversation entries from the dataset file.

        :return: An iterator over the topics, in file order
        :raises Exception: any error raised while building a topic is logged
            and re-raised unchanged
        """
        # JSON is UTF-8 encoded: do not depend on the locale default encoding
        with self.path.open("rt", encoding="utf-8") as fp:
            raw_data = json.load(fp)

        logging.debug("Loaded %d entries from %s", len(raw_data), self.path)
        if raw_data:
            # Guarded so that an empty dataset does not raise an IndexError
            logging.debug("raw data has keys %s", raw_data[0].keys())

        for entry in raw_data:
            # Only the construction is protected, so that errors coming from
            # the consumer of the generator are not reported as parse errors
            try:
                topic = IkatConversationTopic(**norm_dict(entry))
            except Exception as e:
                logging.warning("Failed to parse entry: %s", e)
                raise
            yield topic

    def __iter__(self) -> Iterator[ConversationTree]:
        """Iterates over conversations, yielding one tree per topic

        Each turn contributes a user query record (ID, utterance and
        rewritten utterance) followed by a system answer record; the list of
        records is reversed before being wrapped into the tree.
        """
        for entry in self.entries():
            history: List[Record] = []

            for turn in entry.responses:
                query_id = f"{entry.number}_{turn.turn_id}"

                # USER QUERY record
                history.append(
                    Record(
                        IDItem(query_id),
                        SimpleTextItem(turn.user_utterance),
                        SimpleDecontextualizedItem(turn.resolved_utterance),
                        EntryType.USER_QUERY,
                    )
                )

                # SYSTEM ANSWER record
                # (relevant_ptkbs and citations are not exported for now)
                history.append(
                    Record(
                        AnswerEntry(turn.response),
                        EntryType.SYSTEM_ANSWER,
                    )
                )

            # Reverse so that the last appended record comes first
            history.reverse()
            yield SingleConversationTree(entry.number, history)
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/__init__.py
RENAMED
|
@@ -2,9 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
4
|
from functools import cached_property
|
|
5
|
+
import logging
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
from attrs import define
|
|
7
|
-
from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type
|
|
8
|
+
from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type, TYPE_CHECKING
|
|
8
9
|
import random
|
|
9
10
|
from experimaestro import Config
|
|
10
11
|
from datamaestro.definitions import datatasks, Param, Meta
|
|
@@ -28,6 +29,9 @@ from .base import ( # noqa: F401
|
|
|
28
29
|
AdhocAssessedTopic,
|
|
29
30
|
)
|
|
30
31
|
|
|
32
|
+
#: An adhoc run dictionary (query id -> doc id -> score)
|
|
33
|
+
AdhocRunDict = dict[str, dict[str, float]]
|
|
34
|
+
|
|
31
35
|
|
|
32
36
|
class Documents(Base):
|
|
33
37
|
"""A set of documents with identifiers
|
|
@@ -45,6 +49,22 @@ class Documents(Base):
|
|
|
45
49
|
def iter_documents(self) -> Iterator[DocumentRecord]:
|
|
46
50
|
return self.iter()
|
|
47
51
|
|
|
52
|
+
def iter_documents_from(self, start: int = 0) -> "Iterator[DocumentRecord]":
    """Iterate over the documents, starting from a given position

    Can be specialized in a subclass for faster access

    :param start: The (0-based) position of the first document to return,
        defaults to 0; values lower than 1 do not skip anything
    :return: An iterator over the documents from position ``start`` onwards
        (already exhausted when there are fewer than ``start`` documents)
    """
    from itertools import islice

    documents = self.iter()
    if start > 0:
        logging.info("skipping %d documents", start)
        # Consume exactly `start` documents so that the next one is the
        # document at position `start`; islice stops quietly when the
        # collection is shorter instead of raising StopIteration
        next(islice(documents, start, start), None)

    return documents
|
|
67
|
+
|
|
48
68
|
def iter_ids(self) -> Iterator[str]:
|
|
49
69
|
"""Iterates over document ids
|
|
50
70
|
|
|
@@ -168,7 +188,10 @@ class AdhocAssessments(Base, ABC):
|
|
|
168
188
|
class AdhocRun(Base):
|
|
169
189
|
"""IR adhoc run"""
|
|
170
190
|
|
|
171
|
-
|
|
191
|
+
@abstractmethod
def get_dict(self) -> "AdhocRunDict":
    """Return the run as a nested dictionary

    :return: A mapping query ID -> document ID -> score
    """
    ...
|
|
172
195
|
|
|
173
196
|
|
|
174
197
|
class AdhocResults(Base):
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/base.py
RENAMED
|
@@ -43,6 +43,7 @@ class IDItem(Item, ABC):
|
|
|
43
43
|
|
|
44
44
|
id: str
|
|
45
45
|
|
|
46
|
+
|
|
46
47
|
@define
|
|
47
48
|
class UrlItem(Item):
|
|
48
49
|
"""An url item"""
|
|
@@ -70,7 +71,7 @@ class AdhocAssessedTopic:
|
|
|
70
71
|
"""List of assessments for this topic"""
|
|
71
72
|
|
|
72
73
|
|
|
73
|
-
def create_record(*items: Item, id: str = None, text: str = None):
|
|
74
|
+
def create_record(*items: Item, id: str = None, text: str = None) -> Record:
|
|
74
75
|
"""Easy creation of a text/id item"""
|
|
75
76
|
extra_items = []
|
|
76
77
|
if id is not None:
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/formats.py
RENAMED
|
@@ -99,7 +99,7 @@ class WapoDocument(TextItem):
|
|
|
99
99
|
body_media: Tuple[WapoDocMedia, ...]
|
|
100
100
|
|
|
101
101
|
@cached_property
|
|
102
|
-
def text(self):
|
|
102
|
+
def text(self):
|
|
103
103
|
return f"{self.title} {self.body_paras_html}"
|
|
104
104
|
|
|
105
105
|
|
|
@@ -132,11 +132,15 @@ class DprW100Doc(TextItem):
|
|
|
132
132
|
text: str
|
|
133
133
|
title: str
|
|
134
134
|
|
|
135
|
-
|
|
135
|
+
|
|
136
|
+
@define
|
|
136
137
|
class MsMarcoV2Passage(TextItem):
|
|
137
138
|
text: str
|
|
138
139
|
spans: Tuple[Tuple[int, int], ...]
|
|
139
140
|
msmarco_document_id: str
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@define
|
|
140
144
|
class Touche2020(TextItem):
|
|
141
145
|
text: str
|
|
142
146
|
title: str
|
|
@@ -204,11 +208,13 @@ class TrecTopic(SimpleTextItem):
|
|
|
204
208
|
|
|
205
209
|
TrecTopicRecord = record_type(IDItem, TrecTopic)
|
|
206
210
|
|
|
211
|
+
|
|
207
212
|
@define
|
|
208
213
|
class DprW100Query(TextItem):
|
|
209
214
|
text: str
|
|
210
215
|
answers: Tuple[str]
|
|
211
216
|
|
|
217
|
+
|
|
212
218
|
@define
|
|
213
219
|
class TrecBackgroundLinkingQuery(IDItem):
|
|
214
220
|
query_id: str
|