datamaestro-text 2023.10.10__tar.gz → 2023.11.22__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/.github/workflows/pytest.yml +1 -1
  2. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/.github/workflows/python-publish.yml +2 -2
  3. {datamaestro-text-2023.10.10/src/datamaestro_text.egg-info → datamaestro-text-2023.11.22}/PKG-INFO +3 -5
  4. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/pyproject.toml +4 -0
  5. datamaestro-text-2023.11.22/setup.cfg +4 -0
  6. datamaestro-text-2023.11.22/src/datamaestro_text/data/ir/formats.py +189 -0
  7. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/datasets/irds/data.py +85 -4
  8. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/version.py +2 -2
  9. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22/src/datamaestro_text.egg-info}/PKG-INFO +3 -5
  10. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text.egg-info/SOURCES.txt +0 -3
  11. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text.egg-info/requires.txt +0 -3
  12. datamaestro-text-2023.10.10/setup.cfg +0 -49
  13. datamaestro-text-2023.10.10/setup.py +0 -9
  14. datamaestro-text-2023.10.10/src/datamaestro_text/data/ir/formats.py +0 -26
  15. datamaestro-text-2023.10.10/src/datamaestro_text.egg-info/zip-safe +0 -1
  16. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/.circleci/config.yml +0 -0
  17. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/.flake8 +0 -0
  18. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/.gitignore +0 -0
  19. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/.pre-commit-config.yaml +0 -0
  20. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/.readthedocs.yml +0 -0
  21. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/LICENSE +0 -0
  22. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/MANIFEST.in +0 -0
  23. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/Makefile +0 -0
  24. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/README.md +0 -0
  25. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/Makefile +0 -0
  26. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/make.bat +0 -0
  27. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/requirements.txt +0 -0
  28. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/api/conversation.rst +0 -0
  29. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/api/index.rst +0 -0
  30. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/api/ir.rst +0 -0
  31. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/api/text.rst +0 -0
  32. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/conf.py +0 -0
  33. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/datasets/conversation.rst +0 -0
  34. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/datasets/embeddings.rst +0 -0
  35. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/datasets/index.rst +0 -0
  36. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/datasets/ir.rst +0 -0
  37. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/datasets/recommendation.rst +0 -0
  38. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/datasets/text.rst +0 -0
  39. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/docs/source/index.rst +0 -0
  40. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/mkdocs.yml +0 -0
  41. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/requirements-dev.txt +0 -0
  42. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/requirements.txt +0 -0
  43. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/__init__.py +0 -0
  44. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/__init__.py +0 -0
  45. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/ai/quac.yaml +0 -0
  46. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
  47. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -0
  48. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -0
  49. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
  50. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
  51. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
  52. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
  53. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/sentiment140.py +0 -0
  54. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
  55. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
  56. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
  57. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
  58. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
  59. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
  60. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
  61. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
  62. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
  63. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/__init__.py +0 -0
  64. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
  65. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
  66. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
  67. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
  68. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
  69. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
  70. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
  71. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -0
  72. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
  73. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
  74. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
  75. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
  76. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
  77. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
  78. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
  79. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
  80. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
  81. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
  82. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
  83. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/conversation/__init__.py +0 -0
  84. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/conversation/base.py +0 -0
  85. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/conversation/canard.py +0 -0
  86. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/conversation/orconvqa.py +0 -0
  87. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/embeddings.py +0 -0
  88. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/__init__.py +0 -0
  89. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/base.py +0 -0
  90. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/cord19.py +0 -0
  91. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/csv.py +0 -0
  92. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/data.py +0 -0
  93. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/huggingface.py +0 -0
  94. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/trec.py +0 -0
  95. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/ir/utils.py +0 -0
  96. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/recommendation.py +0 -0
  97. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/tagging.py +0 -0
  98. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/data/text.py +0 -0
  99. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
  100. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
  101. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/datasets/irds/utils.py +0 -0
  102. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/download/tmdb.py +0 -0
  103. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/interfaces/plaintext.py +0 -0
  104. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/interfaces/trec.py +0 -0
  105. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/test/__init__.py +0 -0
  106. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/test/test_datasets.py +0 -0
  107. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/transforms/__init__.py +0 -0
  108. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
  109. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/utils/__init__.py +0 -0
  110. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/utils/files.py +0 -0
  111. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/utils/iter.py +0 -0
  112. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/utils/randomstream.py +0 -0
  113. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text/utils/shuffle.py +0 -0
  114. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
  115. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
  116. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/src/datamaestro_text.egg-info/top_level.txt +0 -0
  117. {datamaestro-text-2023.10.10 → datamaestro-text-2023.11.22}/tox.ini +0 -0
@@ -15,7 +15,7 @@ jobs:
15
15
  runs-on: ubuntu-latest
16
16
  strategy:
17
17
  matrix:
18
- python-version: [3.8, 3.9, "3.10"]
18
+ python-version: [3.8, 3.9, "3.10", "3.11"]
19
19
 
20
20
  steps:
21
21
  - uses: actions/checkout@v2
@@ -20,11 +20,11 @@ jobs:
20
20
  - name: Install dependencies
21
21
  run: |
22
22
  python -m pip install --upgrade pip
23
- pip install setuptools wheel twine
23
+ pip install setuptools wheel twine build
24
24
  - name: Build and publish
25
25
  env:
26
26
  TWINE_USERNAME: __token__
27
27
  TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
28
28
  run: |
29
- python setup.py sdist bdist_wheel
29
+ python -m build --sdist --wheel
30
30
  twine upload dist/*
@@ -1,16 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro-text
3
- Version: 2023.10.10
3
+ Version: 2023.11.22
4
4
  Summary: Datamaestro module for text-related datasets
5
- Home-page: https://github.com/experimaestro/datamaestro_text
6
- Author: Benjamin Piwowarski
7
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
8
6
  License: GPL-3
9
7
  Project-URL: homepage, https://github.com/experimaestro/datamaestro_text
10
8
  Project-URL: documentation, https://datamaestro-text.readthedocs.io/en/latest/
11
9
  Project-URL: repository, https://github.com/experimaestro/datamaestro_text
12
10
  Keywords: dataset manager,information retrieval,experiments
13
- Platform: any
14
11
  Classifier: Development Status :: 4 - Beta
15
12
  Classifier: Intended Audience :: Science/Research
16
13
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
@@ -20,8 +17,9 @@ Classifier: Programming Language :: Python :: 3
20
17
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
18
  Requires-Python: >=3.8
22
19
  Description-Content-Type: text/markdown
23
- Provides-Extra: test
24
20
  License-File: LICENSE
21
+ Requires-Dist: datamaestro>=0.8.16
22
+ Requires-Dist: attrs
25
23
 
26
24
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)
27
25
 
@@ -18,6 +18,10 @@ classifiers = [
18
18
  "Topic :: Software Development :: Libraries :: Python Modules",
19
19
  ]
20
20
 
21
+ [tool.setuptools.dynamic]
22
+ dependencies = {file = ["requirements.txt"]}
23
+ readme = {file = ["README.md"], content-type = "text/markdown"}
24
+
21
25
  [project.urls]
22
26
  homepage = "https://github.com/experimaestro/datamaestro_text"
23
27
  documentation = "https://datamaestro-text.readthedocs.io/en/latest/"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,189 @@
1
+ from typing import ClassVar, Tuple
2
+ from attrs import define
3
+ from ir_datasets.datasets.wapo import WapoDocMedia
4
+ from .base import IDHolder, Document, GenericTopic, IDTopic
5
+ from ir_datasets.datasets.cord19 import Cord19FullTextSection
6
+
7
+
8
+ @define
9
+ class CordDocument(IDHolder, Document):
10
+ text: str
11
+ title: str
12
+ url: str
13
+ pubmed_id: str
14
+
15
+ has_text: ClassVar[bool] = True
16
+
17
+ def get_text(self):
18
+ return f"{self.title} {self.text}"
19
+
20
+
21
+ @define
22
+ class DocumentWithTitle(IDHolder, Document):
23
+ """Web document with title and URL"""
24
+
25
+ title: str
26
+
27
+ text: str
28
+
29
+
30
+ @define
31
+ class CordFullTextDocument(IDHolder, Document):
32
+ title: str
33
+ doi: str
34
+ date: str
35
+ abstract: str
36
+ body: Tuple[Cord19FullTextSection, ...]
37
+
38
+ has_text: ClassVar[bool] = True
39
+
40
+ def get_text(self):
41
+ return f"{self.abstract}"
42
+
43
+
44
+ @define
45
+ class MsMarcoDocument(IDHolder, Document):
46
+ url: str
47
+ title: str
48
+ body: str
49
+
50
+ has_text: ClassVar[bool] = True
51
+
52
+ def get_text(self):
53
+ return f"{self.body}"
54
+
55
+
56
+ @define
57
+ class NFCorpusDocument(IDHolder, Document):
58
+ url: str
59
+ title: str
60
+ abstract: str
61
+
62
+ has_text: ClassVar[bool] = True
63
+
64
+ def get_text(self):
65
+ return f"{self.abstract}"
66
+
67
+
68
+ @define
69
+ class TitleDocument(IDHolder, Document):
70
+ text: str
71
+ title: str
72
+ has_text: ClassVar[bool] = True
73
+
74
+ def get_text(self):
75
+ return f"{self.title} {self.text}"
76
+
77
+
78
+ @define
79
+ class TitleUrlDocument(IDHolder, Document):
80
+ text: str
81
+ title: str
82
+ url: str
83
+ has_text: ClassVar[bool] = True
84
+
85
+ def get_text(self):
86
+ return f"{self.title} {self.text}"
87
+
88
+
89
+ @define
90
+ class TrecParsedDocument(IDHolder, Document):
91
+ title: str
92
+ body: str
93
+ marked_up_doc: bytes
94
+
95
+ has_text: ClassVar[bool] = True
96
+
97
+ def get_text(self):
98
+ return f"{self.title} {self.body}"
99
+
100
+
101
+ @define
102
+ class WapoDocument(IDHolder, Document):
103
+ url: str
104
+ title: str
105
+ author: str
106
+ published_date: int
107
+ kicker: str
108
+ body: str
109
+ body_paras_html: Tuple[str, ...]
110
+ body_media: Tuple[WapoDocMedia, ...]
111
+
112
+ has_text: ClassVar[bool] = True
113
+
114
+ def get_text(self):
115
+ return f"{self.body}"
116
+
117
+
118
+ @define
119
+ class TweetDoc(IDHolder, Document):
120
+ text: str
121
+ user_id: str
122
+ created_at: str
123
+ lang: str
124
+ reply_doc_id: str
125
+ retweet_doc_id: str
126
+ source: bytes
127
+ source_content_type: str
128
+
129
+ def get_text(self):
130
+ return f"{self.text}"
131
+
132
+
133
+ @define
134
+ class TrecTopic(GenericTopic):
135
+ text: str
136
+ query: str
137
+ narrative: str
138
+
139
+ def get_text(self):
140
+ return f"{self.text}"
141
+
142
+
143
+ @define
144
+ class UrlTopic(GenericTopic):
145
+ text: str
146
+ url: str
147
+
148
+ def get_text(self):
149
+ return f"{self.text}"
150
+
151
+
152
+ @define
153
+ class NFCorpusTopic(IDTopic):
154
+ title: str
155
+ all: str
156
+
157
+ def get_text(self):
158
+ return f"{self.title}"
159
+
160
+
161
+ @define
162
+ class TrecQuery(IDTopic):
163
+ title: str
164
+ description: str
165
+ narrative: str
166
+
167
+ def get_text(self):
168
+ return f"{self.description}"
169
+
170
+
171
+ @define
172
+ class TrecMb13Query(IDTopic):
173
+ query: str
174
+ time: str
175
+ tweet_time: str
176
+
177
+ def get_text(self):
178
+ return f"{self.query}"
179
+
180
+
181
+ @define
182
+ class TrecMb14Query(IDTopic):
183
+ query: str
184
+ time: str
185
+ tweet_time: str
186
+ description: str
187
+
188
+ def get_text(self):
189
+ return f"{self.query}"
@@ -2,7 +2,13 @@ import logging
2
2
  from typing import Any, Iterator, Tuple, Type, List
3
3
  import attrs
4
4
  import ir_datasets
5
- from ir_datasets.formats import GenericDoc, GenericQuery, GenericDocPair
5
+ from ir_datasets.formats import (
6
+ GenericDoc,
7
+ GenericQuery,
8
+ GenericDocPair,
9
+ TrecParsedDoc,
10
+ TrecQuery,
11
+ )
6
12
  import ir_datasets.datasets as _irds
7
13
  from experimaestro import Config
8
14
  from experimaestro.compat import cached_property
@@ -72,9 +78,10 @@ class tuple_constructor:
72
78
  self.fields = fields
73
79
 
74
80
  def check(self, source_cls: Type):
75
- assert (
76
- source_cls._fields == self.fields
77
- ), f"Internal error: Fields do not match ({source_cls._fields} and {self.fields})"
81
+ assert source_cls._fields == self.fields, (
82
+ "Internal error: Fields do not match, "
83
+ f"source({source_cls.__qualname__})={source_cls._fields} [vs] target={self.fields}"
84
+ )
78
85
 
79
86
  def __call__(self, entry):
80
87
  return self.target_cls(*tuple(entry))
@@ -91,6 +98,54 @@ class Documents(ir.DocumentStore, IRDSId):
91
98
  _irds.beir.BeirCordDoc: tuple_constructor(
92
99
  formats.CordDocument, "doc_id", "text", "title", "url", "pubmed_id"
93
100
  ),
101
+ _irds.beir.BeirTitleDoc: tuple_constructor(
102
+ formats.TitleDocument, "doc_id", "text", "title"
103
+ ),
104
+ _irds.beir.BeirTitleUrlDoc: tuple_constructor(
105
+ formats.TitleUrlDocument, "doc_id", "text", "title", "url"
106
+ ),
107
+ _irds.msmarco_document.MsMarcoDocument: tuple_constructor(
108
+ formats.MsMarcoDocument, "doc_id", "url", "title", "body"
109
+ ),
110
+ _irds.cord19.Cord19FullTextDoc: tuple_constructor(
111
+ formats.CordFullTextDocument,
112
+ "doc_id",
113
+ "title",
114
+ "doi",
115
+ "date",
116
+ "abstract",
117
+ "body",
118
+ ),
119
+ _irds.nfcorpus.NfCorpusDoc: tuple_constructor(
120
+ formats.NFCorpusDocument, "doc_id", "url", "title", "abstract"
121
+ ),
122
+ TrecParsedDoc: tuple_constructor(
123
+ formats.TrecParsedDocument, "doc_id", "title", "body", "marked_up_doc"
124
+ ),
125
+ _irds.wapo.WapoDoc: tuple_constructor(
126
+ formats.WapoDocument,
127
+ "doc_id",
128
+ "url",
129
+ "title",
130
+ "author",
131
+ "published_date",
132
+ "kicker",
133
+ "body",
134
+ "body_paras_html",
135
+ "body_media",
136
+ ),
137
+ _irds.tweets2013_ia.TweetDoc: tuple_constructor(
138
+ formats.TweetDoc,
139
+ "doc_id",
140
+ "text",
141
+ "user_id",
142
+ "created_at",
143
+ "lang",
144
+ "reply_doc_id",
145
+ "retweet_doc_id",
146
+ "source",
147
+ "source_content_type",
148
+ ),
94
149
  }
95
150
 
96
151
  """Wraps an ir datasets collection -- and provide a default text
@@ -147,6 +202,12 @@ class Documents(ir.DocumentStore, IRDSId):
147
202
  return converter
148
203
 
149
204
 
205
+ if hasattr(_irds, "miracl"):
206
+ Documents.CONVERTERS[_irds.miracl.MiraclDoc] = tuple_constructor(
207
+ formats.DocumentWithTitle, "doc_id", "title", "text"
208
+ )
209
+
210
+
150
211
  @attrs.define()
151
212
  class IRDSQueryWrapper(ir.Topic):
152
213
  query: Any
@@ -158,6 +219,26 @@ class Topics(ir.TopicsStore, IRDSId):
158
219
  _irds.beir.BeirCovidQuery: tuple_constructor(
159
220
  formats.TrecTopic, "query_id", "text", "query", "narrative"
160
221
  ),
222
+ _irds.beir.BeirUrlQuery: tuple_constructor(
223
+ formats.UrlTopic, "query_id", "text", "url"
224
+ ),
225
+ _irds.nfcorpus.NfCorpusQuery: tuple_constructor(
226
+ formats.NFCorpusTopic, "query_id", "title", "all"
227
+ ),
228
+ TrecQuery: tuple_constructor(
229
+ formats.TrecQuery, "query_id", "title", "description", "narrative"
230
+ ),
231
+ _irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
232
+ formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
233
+ ),
234
+ _irds.tweets2013_ia.TrecMb14Query: tuple_constructor(
235
+ formats.TrecMb14Query,
236
+ "query_id",
237
+ "query",
238
+ "time",
239
+ "tweet_time",
240
+ "description",
241
+ ),
161
242
  }
162
243
 
163
244
  def iter(self) -> Iterator[ir.Topic]:
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '2023.10.10'
16
- __version_tuple__ = version_tuple = (2023, 10, 10)
15
+ __version__ = version = '2023.11.22'
16
+ __version_tuple__ = version_tuple = (2023, 11, 22)
@@ -1,16 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro-text
3
- Version: 2023.10.10
3
+ Version: 2023.11.22
4
4
  Summary: Datamaestro module for text-related datasets
5
- Home-page: https://github.com/experimaestro/datamaestro_text
6
- Author: Benjamin Piwowarski
7
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
8
6
  License: GPL-3
9
7
  Project-URL: homepage, https://github.com/experimaestro/datamaestro_text
10
8
  Project-URL: documentation, https://datamaestro-text.readthedocs.io/en/latest/
11
9
  Project-URL: repository, https://github.com/experimaestro/datamaestro_text
12
10
  Keywords: dataset manager,information retrieval,experiments
13
- Platform: any
14
11
  Classifier: Development Status :: 4 - Beta
15
12
  Classifier: Intended Audience :: Science/Research
16
13
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
@@ -20,8 +17,9 @@ Classifier: Programming Language :: Python :: 3
20
17
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
18
  Requires-Python: >=3.8
22
19
  Description-Content-Type: text/markdown
23
- Provides-Extra: test
24
20
  License-File: LICENSE
21
+ Requires-Dist: datamaestro>=0.8.16
22
+ Requires-Dist: attrs
25
23
 
26
24
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)
27
25
 
@@ -10,8 +10,6 @@ mkdocs.yml
10
10
  pyproject.toml
11
11
  requirements-dev.txt
12
12
  requirements.txt
13
- setup.cfg
14
- setup.py
15
13
  tox.ini
16
14
  .circleci/config.yml
17
15
  .github/workflows/pytest.yml
@@ -39,7 +37,6 @@ src/datamaestro_text.egg-info/dependency_links.txt
39
37
  src/datamaestro_text.egg-info/entry_points.txt
40
38
  src/datamaestro_text.egg-info/requires.txt
41
39
  src/datamaestro_text.egg-info/top_level.txt
42
- src/datamaestro_text.egg-info/zip-safe
43
40
  src/datamaestro_text/config/__init__.py
44
41
  src/datamaestro_text/config/ai/quac.yaml
45
42
  src/datamaestro_text/config/com/oscar-corpus.py
@@ -1,49 +0,0 @@
1
- [metadata]
2
- name = datamaestro_text
3
- author = Benjamin Piwowarski
4
- author-email = benjamin@piwowarski.fr
5
- home-page = https://github.com/experimaestro/datamaestro_text
6
- description = "Text related datasets"
7
- long-description = file: README.md, CHANGELOG.md
8
- long-description-content-type = text/markdown
9
- license = GPL-3
10
- license_file = LICENSE
11
- platform = any
12
- keywords = dataset manager
13
- classifiers =
14
- Development Status :: 4 - Beta
15
- Intended Audience :: Science/Research
16
- License :: OSI Approved :: GNU General Public License v3 (GPLv3)
17
- Operating System :: OS Independent
18
- Programming Language :: Python
19
- Programming Language :: Python :: 3
20
- Topic :: Software Development :: Libraries :: Python Modules
21
-
22
- [options]
23
- zip_safe = true
24
- include_package_data = true
25
- python_requires = >= 3.8
26
- test_suite = datamaestro.test
27
- setup_requires =
28
- setuptools >= 65.0.0
29
- setuptools_scm
30
- wheel
31
-
32
- [options.extras_require]
33
- test =
34
- tox
35
-
36
- [mypy]
37
- python_version = 3.7
38
- warn_unused_ignores = True
39
-
40
- [flake8]
41
- doctests = True
42
- exclude = .git, .eggs, __pycache__, tests/, docs/, build/, dist/
43
- max-line-length = 88
44
- extend-ignore = E203
45
-
46
- [egg_info]
47
- tag_build =
48
- tag_date = 0
49
-
@@ -1,9 +0,0 @@
1
- from pathlib import Path
2
- from setuptools import setup
3
-
4
- basepath = Path(__file__).parent
5
-
6
-
7
- setup(
8
- install_requires=(basepath / "requirements.txt").read_text(), use_scm_version=True,
9
- )
@@ -1,26 +0,0 @@
1
- from typing import ClassVar
2
- from attrs import define
3
- from .base import IDHolder, Document, GenericTopic
4
-
5
-
6
- @define
7
- class CordDocument(IDHolder, Document):
8
- text: str
9
- title: str
10
- url: str
11
- pubmed_id: str
12
-
13
- has_text: ClassVar[bool] = True
14
-
15
- def get_text(self):
16
- return f"{self.title} {self.text}"
17
-
18
-
19
- @define
20
- class TrecTopic(GenericTopic):
21
- text: str
22
- query: str
23
- narrative: str
24
-
25
- def get_text(self):
26
- return f"{self.query}"