datamaestro-text 2023.10.10__tar.gz → 2023.11.22__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/.github/workflows/pytest.yml +1 -1
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/.github/workflows/python-publish.yml +2 -2
- {datamaestro-text-2023.10.10/src/datamaestro_text.egg-info → datamaestro-text-2023.11.22}/PKG-INFO +3 -5
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/pyproject.toml +4 -0
- datamaestro-text-2023.11.22/setup.cfg +4 -0
- datamaestro-text-2023.11.22/src/datamaestro_text/data/ir/formats.py +189 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/datasets/irds/data.py +85 -4
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/version.py +2 -2
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22/src/datamaestro_text.egg-info}/PKG-INFO +3 -5
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text.egg-info/SOURCES.txt +0 -3
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text.egg-info/requires.txt +0 -3
- datamaestro-text-2023.10.10/setup.cfg +0 -49
- datamaestro-text-2023.10.10/setup.py +0 -9
- datamaestro-text-2023.10.10/src/datamaestro_text/data/ir/formats.py +0 -26
- datamaestro-text-2023.10.10/src/datamaestro_text.egg-info/zip-safe +0 -1
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/.circleci/config.yml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/.flake8 +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/.gitignore +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/.pre-commit-config.yaml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/.readthedocs.yml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/LICENSE +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/MANIFEST.in +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/Makefile +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/README.md +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/Makefile +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/make.bat +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/requirements.txt +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/api/conversation.rst +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/api/index.rst +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/api/ir.rst +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/api/text.rst +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/conf.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/datasets/conversation.rst +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/datasets/embeddings.rst +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/datasets/index.rst +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/datasets/ir.rst +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/datasets/recommendation.rst +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/datasets/text.rst +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/index.rst +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/mkdocs.yml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/requirements-dev.txt +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/requirements.txt +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/__init__.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/__init__.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/ai/quac.yaml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/sentiment140.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/__init__.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/conversation/__init__.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/conversation/base.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/conversation/canard.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/conversation/orconvqa.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/embeddings.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/__init__.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/base.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/cord19.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/csv.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/data.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/huggingface.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/trec.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/utils.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/recommendation.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/tagging.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/text.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/datasets/irds/utils.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/download/tmdb.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/interfaces/plaintext.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/interfaces/trec.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/test/__init__.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/test/test_datasets.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/transforms/__init__.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/utils/__init__.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/utils/files.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/utils/iter.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/utils/randomstream.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/utils/shuffle.py +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text.egg-info/top_level.txt +0 -0
- {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/tox.ini +0 -0
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/.github/workflows/python-publish.yml
RENAMED
|
@@ -20,11 +20,11 @@ jobs:
|
|
|
20
20
|
- name: Install dependencies
|
|
21
21
|
run: |
|
|
22
22
|
python -m pip install --upgrade pip
|
|
23
|
-
pip install setuptools wheel twine
|
|
23
|
+
pip install setuptools wheel twine build
|
|
24
24
|
- name: Build and publish
|
|
25
25
|
env:
|
|
26
26
|
TWINE_USERNAME: __token__
|
|
27
27
|
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
|
28
28
|
run: |
|
|
29
|
-
python
|
|
29
|
+
python -m build --sdist --wheel
|
|
30
30
|
twine upload dist/*
|
{datamaestro-text-2023.10.10/src/datamaestro_text.egg-info → datamaestro-text-2023.11.22}/PKG-INFO
RENAMED
|
@@ -1,16 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datamaestro-text
|
|
3
|
-
Version: 2023.
|
|
3
|
+
Version: 2023.11.22
|
|
4
4
|
Summary: Datamaestro module for text-related datasets
|
|
5
|
-
Home-page: https://github.com/experimaestro/datamaestro_text
|
|
6
|
-
Author: Benjamin Piwowarski
|
|
7
5
|
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
8
6
|
License: GPL-3
|
|
9
7
|
Project-URL: homepage, https://github.com/experimaestro/datamaestro_text
|
|
10
8
|
Project-URL: documentation, https://datamaestro-text.readthedocs.io/en/latest/
|
|
11
9
|
Project-URL: repository, https://github.com/experimaestro/datamaestro_text
|
|
12
10
|
Keywords: dataset manager,information retrieval,experiments
|
|
13
|
-
Platform: any
|
|
14
11
|
Classifier: Development Status :: 4 - Beta
|
|
15
12
|
Classifier: Intended Audience :: Science/Research
|
|
16
13
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
@@ -20,8 +17,9 @@ Classifier: Programming Language :: Python :: 3
|
|
|
20
17
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
18
|
Requires-Python: >=3.8
|
|
22
19
|
Description-Content-Type: text/markdown
|
|
23
|
-
Provides-Extra: test
|
|
24
20
|
License-File: LICENSE
|
|
21
|
+
Requires-Dist: datamaestro>=0.8.16
|
|
22
|
+
Requires-Dist: attrs
|
|
25
23
|
|
|
26
24
|
[](https://github.com/pre-commit/pre-commit) [](https://badge.fury.io/py/datamaestro-text)
|
|
27
25
|
|
|
@@ -18,6 +18,10 @@ classifiers = [
|
|
|
18
18
|
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
19
19
|
]
|
|
20
20
|
|
|
21
|
+
[tool.setuptools.dynamic]
|
|
22
|
+
dependencies = {file = ["requirements.txt"]}
|
|
23
|
+
readme = {file = ["README.md"], content-type = "text/markdown"}
|
|
24
|
+
|
|
21
25
|
[project.urls]
|
|
22
26
|
homepage = "https://github.com/experimaestro/datamaestro_text"
|
|
23
27
|
documentation = "https://datamaestro-text.readthedocs.io/en/latest/"
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
from typing import ClassVar, Tuple
|
|
2
|
+
from attrs import define
|
|
3
|
+
from ir_datasets.datasets.wapo import WapoDocMedia
|
|
4
|
+
from .base import IDHolder, Document, GenericTopic, IDTopic
|
|
5
|
+
from ir_datasets.datasets.cord19 import Cord19FullTextSection
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@define
|
|
9
|
+
class CordDocument(IDHolder, Document):
|
|
10
|
+
text: str
|
|
11
|
+
title: str
|
|
12
|
+
url: str
|
|
13
|
+
pubmed_id: str
|
|
14
|
+
|
|
15
|
+
has_text: ClassVar[bool] = True
|
|
16
|
+
|
|
17
|
+
def get_text(self):
|
|
18
|
+
return f"{self.title} {self.text}"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@define
|
|
22
|
+
class DocumentWithTitle(IDHolder, Document):
|
|
23
|
+
"""Web document with title and URL"""
|
|
24
|
+
|
|
25
|
+
title: str
|
|
26
|
+
|
|
27
|
+
text: str
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@define
|
|
31
|
+
class CordFullTextDocument(IDHolder, Document):
|
|
32
|
+
title: str
|
|
33
|
+
doi: str
|
|
34
|
+
date: str
|
|
35
|
+
abstract: str
|
|
36
|
+
body: Tuple[Cord19FullTextSection, ...]
|
|
37
|
+
|
|
38
|
+
has_text: ClassVar[bool] = True
|
|
39
|
+
|
|
40
|
+
def get_text(self):
|
|
41
|
+
return f"{self.abstract}"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@define
|
|
45
|
+
class MsMarcoDocument(IDHolder, Document):
|
|
46
|
+
url: str
|
|
47
|
+
title: str
|
|
48
|
+
body: str
|
|
49
|
+
|
|
50
|
+
has_text: ClassVar[bool] = True
|
|
51
|
+
|
|
52
|
+
def get_text(self):
|
|
53
|
+
return f"{self.body}"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@define
|
|
57
|
+
class NFCorpusDocument(IDHolder, Document):
|
|
58
|
+
url: str
|
|
59
|
+
title: str
|
|
60
|
+
abstract: str
|
|
61
|
+
|
|
62
|
+
has_text: ClassVar[bool] = True
|
|
63
|
+
|
|
64
|
+
def get_text(self):
|
|
65
|
+
return f"{self.abstract}"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@define
|
|
69
|
+
class TitleDocument(IDHolder, Document):
|
|
70
|
+
text: str
|
|
71
|
+
title: str
|
|
72
|
+
has_text: ClassVar[bool] = True
|
|
73
|
+
|
|
74
|
+
def get_text(self):
|
|
75
|
+
return f"{self.title} {self.text}"
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@define
|
|
79
|
+
class TitleUrlDocument(IDHolder, Document):
|
|
80
|
+
text: str
|
|
81
|
+
title: str
|
|
82
|
+
url: str
|
|
83
|
+
has_text: ClassVar[bool] = True
|
|
84
|
+
|
|
85
|
+
def get_text(self):
|
|
86
|
+
return f"{self.title} {self.text}"
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@define
|
|
90
|
+
class TrecParsedDocument(IDHolder, Document):
|
|
91
|
+
title: str
|
|
92
|
+
body: str
|
|
93
|
+
marked_up_doc: bytes
|
|
94
|
+
|
|
95
|
+
has_text: ClassVar[bool] = True
|
|
96
|
+
|
|
97
|
+
def get_text(self):
|
|
98
|
+
return f"{self.title} {self.body}"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@define
|
|
102
|
+
class WapoDocument(IDHolder, Document):
|
|
103
|
+
url: str
|
|
104
|
+
title: str
|
|
105
|
+
author: str
|
|
106
|
+
published_date: int
|
|
107
|
+
kicker: str
|
|
108
|
+
body: str
|
|
109
|
+
body_paras_html: Tuple[str, ...]
|
|
110
|
+
body_media: Tuple[WapoDocMedia, ...]
|
|
111
|
+
|
|
112
|
+
has_text: ClassVar[bool] = True
|
|
113
|
+
|
|
114
|
+
def get_text(self):
|
|
115
|
+
return f"{self.body}"
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@define
|
|
119
|
+
class TweetDoc(IDHolder, Document):
|
|
120
|
+
text: str
|
|
121
|
+
user_id: str
|
|
122
|
+
created_at: str
|
|
123
|
+
lang: str
|
|
124
|
+
reply_doc_id: str
|
|
125
|
+
retweet_doc_id: str
|
|
126
|
+
source: bytes
|
|
127
|
+
source_content_type: str
|
|
128
|
+
|
|
129
|
+
def get_text(self):
|
|
130
|
+
return f"{self.text}"
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@define
|
|
134
|
+
class TrecTopic(GenericTopic):
|
|
135
|
+
text: str
|
|
136
|
+
query: str
|
|
137
|
+
narrative: str
|
|
138
|
+
|
|
139
|
+
def get_text(self):
|
|
140
|
+
return f"{self.text}"
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@define
|
|
144
|
+
class UrlTopic(GenericTopic):
|
|
145
|
+
text: str
|
|
146
|
+
url: str
|
|
147
|
+
|
|
148
|
+
def get_text(self):
|
|
149
|
+
return f"{self.text}"
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@define
|
|
153
|
+
class NFCorpusTopic(IDTopic):
|
|
154
|
+
title: str
|
|
155
|
+
all: str
|
|
156
|
+
|
|
157
|
+
def get_text(self):
|
|
158
|
+
return f"{self.title}"
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
@define
|
|
162
|
+
class TrecQuery(IDTopic):
|
|
163
|
+
title: str
|
|
164
|
+
description: str
|
|
165
|
+
narrative: str
|
|
166
|
+
|
|
167
|
+
def get_text(self):
|
|
168
|
+
return f"{self.description}"
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
@define
|
|
172
|
+
class TrecMb13Query(IDTopic):
|
|
173
|
+
query: str
|
|
174
|
+
time: str
|
|
175
|
+
tweet_time: str
|
|
176
|
+
|
|
177
|
+
def get_text(self):
|
|
178
|
+
return f"{self.query}"
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
@define
|
|
182
|
+
class TrecMb14Query(IDTopic):
|
|
183
|
+
query: str
|
|
184
|
+
time: str
|
|
185
|
+
tweet_time: str
|
|
186
|
+
description: str
|
|
187
|
+
|
|
188
|
+
def get_text(self):
|
|
189
|
+
return f"{self.query}"
|
|
@@ -2,7 +2,13 @@ import logging
|
|
|
2
2
|
from typing import Any, Iterator, Tuple, Type, List
|
|
3
3
|
import attrs
|
|
4
4
|
import ir_datasets
|
|
5
|
-
from ir_datasets.formats import
|
|
5
|
+
from ir_datasets.formats import (
|
|
6
|
+
GenericDoc,
|
|
7
|
+
GenericQuery,
|
|
8
|
+
GenericDocPair,
|
|
9
|
+
TrecParsedDoc,
|
|
10
|
+
TrecQuery,
|
|
11
|
+
)
|
|
6
12
|
import ir_datasets.datasets as _irds
|
|
7
13
|
from experimaestro import Config
|
|
8
14
|
from experimaestro.compat import cached_property
|
|
@@ -72,9 +78,10 @@ class tuple_constructor:
|
|
|
72
78
|
self.fields = fields
|
|
73
79
|
|
|
74
80
|
def check(self, source_cls: Type):
|
|
75
|
-
assert (
|
|
76
|
-
|
|
77
|
-
|
|
81
|
+
assert source_cls._fields == self.fields, (
|
|
82
|
+
"Internal error: Fields do not match, "
|
|
83
|
+
f"source({source_cls.__qualname__})={source_cls._fields} [vs] target={self.fields}"
|
|
84
|
+
)
|
|
78
85
|
|
|
79
86
|
def __call__(self, entry):
|
|
80
87
|
return self.target_cls(*tuple(entry))
|
|
@@ -91,6 +98,54 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
91
98
|
_irds.beir.BeirCordDoc: tuple_constructor(
|
|
92
99
|
formats.CordDocument, "doc_id", "text", "title", "url", "pubmed_id"
|
|
93
100
|
),
|
|
101
|
+
_irds.beir.BeirTitleDoc: tuple_constructor(
|
|
102
|
+
formats.TitleDocument, "doc_id", "text", "title"
|
|
103
|
+
),
|
|
104
|
+
_irds.beir.BeirTitleUrlDoc: tuple_constructor(
|
|
105
|
+
formats.TitleUrlDocument, "doc_id", "text", "title", "url"
|
|
106
|
+
),
|
|
107
|
+
_irds.msmarco_document.MsMarcoDocument: tuple_constructor(
|
|
108
|
+
formats.MsMarcoDocument, "doc_id", "url", "title", "body"
|
|
109
|
+
),
|
|
110
|
+
_irds.cord19.Cord19FullTextDoc: tuple_constructor(
|
|
111
|
+
formats.CordFullTextDocument,
|
|
112
|
+
"doc_id",
|
|
113
|
+
"title",
|
|
114
|
+
"doi",
|
|
115
|
+
"date",
|
|
116
|
+
"abstract",
|
|
117
|
+
"body",
|
|
118
|
+
),
|
|
119
|
+
_irds.nfcorpus.NfCorpusDoc: tuple_constructor(
|
|
120
|
+
formats.NFCorpusDocument, "doc_id", "url", "title", "abstract"
|
|
121
|
+
),
|
|
122
|
+
TrecParsedDoc: tuple_constructor(
|
|
123
|
+
formats.TrecParsedDocument, "doc_id", "title", "body", "marked_up_doc"
|
|
124
|
+
),
|
|
125
|
+
_irds.wapo.WapoDoc: tuple_constructor(
|
|
126
|
+
formats.WapoDocument,
|
|
127
|
+
"doc_id",
|
|
128
|
+
"url",
|
|
129
|
+
"title",
|
|
130
|
+
"author",
|
|
131
|
+
"published_date",
|
|
132
|
+
"kicker",
|
|
133
|
+
"body",
|
|
134
|
+
"body_paras_html",
|
|
135
|
+
"body_media",
|
|
136
|
+
),
|
|
137
|
+
_irds.tweets2013_ia.TweetDoc: tuple_constructor(
|
|
138
|
+
formats.TweetDoc,
|
|
139
|
+
"doc_id",
|
|
140
|
+
"text",
|
|
141
|
+
"user_id",
|
|
142
|
+
"created_at",
|
|
143
|
+
"lang",
|
|
144
|
+
"reply_doc_id",
|
|
145
|
+
"retweet_doc_id",
|
|
146
|
+
"source",
|
|
147
|
+
"source_content_type",
|
|
148
|
+
),
|
|
94
149
|
}
|
|
95
150
|
|
|
96
151
|
"""Wraps an ir datasets collection -- and provide a default text
|
|
@@ -147,6 +202,12 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
147
202
|
return converter
|
|
148
203
|
|
|
149
204
|
|
|
205
|
+
if hasattr(_irds, "miracl"):
|
|
206
|
+
Documents.CONVERTERS[_irds.miracl.MiraclDoc] = tuple_constructor(
|
|
207
|
+
formats.DocumentWithTitle, "doc_id", "title", "text"
|
|
208
|
+
)
|
|
209
|
+
|
|
210
|
+
|
|
150
211
|
@attrs.define()
|
|
151
212
|
class IRDSQueryWrapper(ir.Topic):
|
|
152
213
|
query: Any
|
|
@@ -158,6 +219,26 @@ class Topics(ir.TopicsStore, IRDSId):
|
|
|
158
219
|
_irds.beir.BeirCovidQuery: tuple_constructor(
|
|
159
220
|
formats.TrecTopic, "query_id", "text", "query", "narrative"
|
|
160
221
|
),
|
|
222
|
+
_irds.beir.BeirUrlQuery: tuple_constructor(
|
|
223
|
+
formats.UrlTopic, "query_id", "text", "url"
|
|
224
|
+
),
|
|
225
|
+
_irds.nfcorpus.NfCorpusQuery: tuple_constructor(
|
|
226
|
+
formats.NFCorpusTopic, "query_id", "title", "all"
|
|
227
|
+
),
|
|
228
|
+
TrecQuery: tuple_constructor(
|
|
229
|
+
formats.TrecQuery, "query_id", "title", "description", "narrative"
|
|
230
|
+
),
|
|
231
|
+
_irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
|
|
232
|
+
formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
|
|
233
|
+
),
|
|
234
|
+
_irds.tweets2013_ia.TrecMb14Query: tuple_constructor(
|
|
235
|
+
formats.TrecMb14Query,
|
|
236
|
+
"query_id",
|
|
237
|
+
"query",
|
|
238
|
+
"time",
|
|
239
|
+
"tweet_time",
|
|
240
|
+
"description",
|
|
241
|
+
),
|
|
161
242
|
}
|
|
162
243
|
|
|
163
244
|
def iter(self) -> Iterator[ir.Topic]:
|
|
@@ -12,5 +12,5 @@ __version__: str
|
|
|
12
12
|
__version_tuple__: VERSION_TUPLE
|
|
13
13
|
version_tuple: VERSION_TUPLE
|
|
14
14
|
|
|
15
|
-
__version__ = version = '2023.
|
|
16
|
-
__version_tuple__ = version_tuple = (2023,
|
|
15
|
+
__version__ = version = '2023.11.22'
|
|
16
|
+
__version_tuple__ = version_tuple = (2023, 11, 22)
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22/src/datamaestro_text.egg-info}/PKG-INFO
RENAMED
|
@@ -1,16 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datamaestro-text
|
|
3
|
-
Version: 2023.
|
|
3
|
+
Version: 2023.11.22
|
|
4
4
|
Summary: Datamaestro module for text-related datasets
|
|
5
|
-
Home-page: https://github.com/experimaestro/datamaestro_text
|
|
6
|
-
Author: Benjamin Piwowarski
|
|
7
5
|
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
8
6
|
License: GPL-3
|
|
9
7
|
Project-URL: homepage, https://github.com/experimaestro/datamaestro_text
|
|
10
8
|
Project-URL: documentation, https://datamaestro-text.readthedocs.io/en/latest/
|
|
11
9
|
Project-URL: repository, https://github.com/experimaestro/datamaestro_text
|
|
12
10
|
Keywords: dataset manager,information retrieval,experiments
|
|
13
|
-
Platform: any
|
|
14
11
|
Classifier: Development Status :: 4 - Beta
|
|
15
12
|
Classifier: Intended Audience :: Science/Research
|
|
16
13
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
@@ -20,8 +17,9 @@ Classifier: Programming Language :: Python :: 3
|
|
|
20
17
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
18
|
Requires-Python: >=3.8
|
|
22
19
|
Description-Content-Type: text/markdown
|
|
23
|
-
Provides-Extra: test
|
|
24
20
|
License-File: LICENSE
|
|
21
|
+
Requires-Dist: datamaestro>=0.8.16
|
|
22
|
+
Requires-Dist: attrs
|
|
25
23
|
|
|
26
24
|
[](https://github.com/pre-commit/pre-commit) [](https://badge.fury.io/py/datamaestro-text)
|
|
27
25
|
|
|
@@ -10,8 +10,6 @@ mkdocs.yml
|
|
|
10
10
|
pyproject.toml
|
|
11
11
|
requirements-dev.txt
|
|
12
12
|
requirements.txt
|
|
13
|
-
setup.cfg
|
|
14
|
-
setup.py
|
|
15
13
|
tox.ini
|
|
16
14
|
.circleci/config.yml
|
|
17
15
|
.github/workflows/pytest.yml
|
|
@@ -39,7 +37,6 @@ src/datamaestro_text.egg-info/dependency_links.txt
|
|
|
39
37
|
src/datamaestro_text.egg-info/entry_points.txt
|
|
40
38
|
src/datamaestro_text.egg-info/requires.txt
|
|
41
39
|
src/datamaestro_text.egg-info/top_level.txt
|
|
42
|
-
src/datamaestro_text.egg-info/zip-safe
|
|
43
40
|
src/datamaestro_text/config/__init__.py
|
|
44
41
|
src/datamaestro_text/config/ai/quac.yaml
|
|
45
42
|
src/datamaestro_text/config/com/oscar-corpus.py
|
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
[metadata]
|
|
2
|
-
name = datamaestro_text
|
|
3
|
-
author = Benjamin Piwowarski
|
|
4
|
-
author-email = benjamin@piwowarski.fr
|
|
5
|
-
home-page = https://github.com/experimaestro/datamaestro_text
|
|
6
|
-
description = "Text related datasets"
|
|
7
|
-
long-description = file: README.md, CHANGELOG.md
|
|
8
|
-
long-description-content-type = text/markdown
|
|
9
|
-
license = GPL-3
|
|
10
|
-
license_file = LICENSE
|
|
11
|
-
platform = any
|
|
12
|
-
keywords = dataset manager
|
|
13
|
-
classifiers =
|
|
14
|
-
Development Status :: 4 - Beta
|
|
15
|
-
Intended Audience :: Science/Research
|
|
16
|
-
License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
17
|
-
Operating System :: OS Independent
|
|
18
|
-
Programming Language :: Python
|
|
19
|
-
Programming Language :: Python :: 3
|
|
20
|
-
Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
-
|
|
22
|
-
[options]
|
|
23
|
-
zip_safe = true
|
|
24
|
-
include_package_data = true
|
|
25
|
-
python_requires = >= 3.8
|
|
26
|
-
test_suite = datamaestro.test
|
|
27
|
-
setup_requires =
|
|
28
|
-
setuptools >= 65.0.0
|
|
29
|
-
setuptools_scm
|
|
30
|
-
wheel
|
|
31
|
-
|
|
32
|
-
[options.extras_require]
|
|
33
|
-
test =
|
|
34
|
-
tox
|
|
35
|
-
|
|
36
|
-
[mypy]
|
|
37
|
-
python_version = 3.7
|
|
38
|
-
warn_unused_ignores = True
|
|
39
|
-
|
|
40
|
-
[flake8]
|
|
41
|
-
doctests = True
|
|
42
|
-
exclude = .git, .eggs, __pycache__, tests/, docs/, build/, dist/
|
|
43
|
-
max-line-length = 88
|
|
44
|
-
extend-ignore = E203
|
|
45
|
-
|
|
46
|
-
[egg_info]
|
|
47
|
-
tag_build =
|
|
48
|
-
tag_date = 0
|
|
49
|
-
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
from typing import ClassVar
|
|
2
|
-
from attrs import define
|
|
3
|
-
from .base import IDHolder, Document, GenericTopic
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
@define
|
|
7
|
-
class CordDocument(IDHolder, Document):
|
|
8
|
-
text: str
|
|
9
|
-
title: str
|
|
10
|
-
url: str
|
|
11
|
-
pubmed_id: str
|
|
12
|
-
|
|
13
|
-
has_text: ClassVar[bool] = True
|
|
14
|
-
|
|
15
|
-
def get_text(self):
|
|
16
|
-
return f"{self.title} {self.text}"
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@define
|
|
20
|
-
class TrecTopic(GenericTopic):
|
|
21
|
-
text: str
|
|
22
|
-
query: str
|
|
23
|
-
narrative: str
|
|
24
|
-
|
|
25
|
-
def get_text(self):
|
|
26
|
-
return f"{self.query}"
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/api/conversation.rst
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/datasets/conversation.rst
RENAMED
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/datasets/embeddings.rst
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/datasets/recommendation.rst
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/__init__.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/__init__.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/ai/quac.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/embeddings.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/__init__.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/base.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/cord19.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/csv.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/data.py
RENAMED
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/trec.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/tagging.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/text.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/download/tmdb.py
RENAMED
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/interfaces/trec.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/test/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/utils/__init__.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/utils/files.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/utils/iter.py
RENAMED
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/utils/shuffle.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|