pyannotators-entityfishing 0.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyannotators_entityfishing/__init__.py +3 -0
- pyannotators_entityfishing/ef_client.py +242 -0
- pyannotators_entityfishing/entityfishing.py +347 -0
- pyannotators_entityfishing-0.6.3.dist-info/METADATA +110 -0
- pyannotators_entityfishing-0.6.3.dist-info/RECORD +7 -0
- pyannotators_entityfishing-0.6.3.dist-info/WHEEL +4 -0
- pyannotators_entityfishing-0.6.3.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import logging
|
|
3
|
+
import time
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from concurrent.futures import as_completed
|
|
6
|
+
from datetime import timedelta
|
|
7
|
+
|
|
8
|
+
import requests
|
|
9
|
+
from collections_extended import RangeMap
|
|
10
|
+
from pymultirole_plugins.v1.schema import Document
|
|
11
|
+
from requests.adapters import DEFAULT_POOLSIZE, HTTPAdapter
|
|
12
|
+
from requests_cache import CachedSession
|
|
13
|
+
from requests_futures.sessions import FuturesSession
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger("ef-client")
|
|
16
|
+
|
|
17
|
+
DEFAULT_CHUNK_SIZE = 1000
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class EntityFishingClient:
    """HTTP client for an entity-fishing service.

    Holds two independently configured sessions: one for the disambiguation
    endpoint (POST) and one for the knowledge-base concept endpoint (GET).
    Each can optionally be backed by an in-memory SQLite response cache and is
    wrapped in a FuturesSession for parallel requests.
    """

    def __init__(
        self,
        base_url: str = "https://sherpa-entityfishing.kairntech.com",
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        cache_disamb: int = -1,
        cache_concept: int = -1,
        pool_size: int = 8,
        full: bool = False,
    ):
        """Create a client.

        Args:
            base_url: Base URL of the entity-fishing service (trailing slash stripped).
            chunk_size: Target chunk size in characters for chunked disambiguation
                (negative disables chunking).
            cache_disamb: TTL in seconds for the disambiguation response cache
                (<= 0 disables caching).
            cache_concept: TTL in seconds for the KB concept response cache
                (<= 0 disables caching).
            pool_size: Max parallel HTTP connections / worker threads.
            full: Whether to request full concept data in disambiguation responses.
        """
        self.base_url = base_url.rstrip("/")
        self.chunk_size = chunk_size
        self.full = full

        # Disambiguation session (POST) + its async wrapper.
        self.dsession = self._make_session("disamb_cache", cache_disamb, "POST", pool_size)
        self.fdsession = FuturesSession(session=self.dsession, max_workers=pool_size)

        # KB concept session (GET) + its async wrapper.
        self.ksession = self._make_session("concept_cache", cache_concept, "GET", pool_size)
        self.fksession = FuturesSession(session=self.ksession, max_workers=pool_size)

        self.disamb_url = "/service/disambiguate/"
        self.kb_url = "/service/kb/concept/"
        self.term_url = "/service/kb/term/"

    @staticmethod
    def _make_session(cache_name: str, cache_seconds: int, method: str, pool_size: int):
        """Build one configured session (was duplicated inline for both sessions).

        Returns a CachedSession backed by a shared in-memory SQLite database when
        cache_seconds > 0, otherwise a plain requests.Session; in both cases with
        enlarged connection pools and JSON headers.
        """
        session = (
            CachedSession(
                f"file:{cache_name}?mode=memory&cache=shared",
                backend="sqlite",
                uri=True,
                wal=True,
                cache_control=True,
                expire_after=timedelta(seconds=cache_seconds),
                allowable_methods=[method],
            )
            if cache_seconds > 0
            else requests.Session()
        )
        session.mount("http://", HTTPAdapter(pool_maxsize=max(DEFAULT_POOLSIZE, pool_size)))
        session.mount("https://", HTTPAdapter(pool_maxsize=max(DEFAULT_POOLSIZE, pool_size)))
        session.headers.update({"Content-Type": "application/json", "Accept": "application/json"})
        # NOTE(review): TLS certificate verification is disabled, as in the
        # original code; confirm this is intended for the target deployment.
        session.verify = False
        return session
|
|
75
|
+
|
|
76
|
+
def _build_disamb_query(self, text: str, sentences: list[dict], lang: str, minSelectorScore: float, maxTermFrequency: float | None, short_text: bool = False) -> dict:
|
|
77
|
+
text = text.replace("\r\n", " \n")
|
|
78
|
+
disamb_query = {
|
|
79
|
+
"sentences": sentences,
|
|
80
|
+
"language": {"lang": lang},
|
|
81
|
+
"mentions": ["wikipedia"],
|
|
82
|
+
"nbest": short_text,
|
|
83
|
+
"sentence": False,
|
|
84
|
+
"full": self.full,
|
|
85
|
+
"customisation": "generic",
|
|
86
|
+
"minSelectorScore": minSelectorScore,
|
|
87
|
+
}
|
|
88
|
+
if short_text:
|
|
89
|
+
disamb_query["shortText"] = text
|
|
90
|
+
else:
|
|
91
|
+
disamb_query["text"] = text
|
|
92
|
+
if maxTermFrequency is not None and isinstance(maxTermFrequency, float):
|
|
93
|
+
disamb_query["maxTermFrequency"] = maxTermFrequency
|
|
94
|
+
return disamb_query
|
|
95
|
+
|
|
96
|
+
def disamb_query(self, text: str, lang: str, minSelectorScore: float, maxTermFrequency: float | None, sents: tuple, short_text: bool = False) -> list:
|
|
97
|
+
sentences = [{"offsetStart": s[0], "offsetEnd": s[1]} for s in sents]
|
|
98
|
+
disamb_query = self._build_disamb_query(text, sentences, lang, minSelectorScore, maxTermFrequency, short_text)
|
|
99
|
+
try:
|
|
100
|
+
start = time.time()
|
|
101
|
+
resp = self.dsession.post(self.base_url + self.disamb_url, json=disamb_query, timeout=(30, 300))
|
|
102
|
+
duration = time.time() - start
|
|
103
|
+
logger.debug("EF disamb duration %0.3fs", duration)
|
|
104
|
+
if resp.ok:
|
|
105
|
+
result = resp.json()
|
|
106
|
+
return result.get("entities", [])
|
|
107
|
+
else:
|
|
108
|
+
resp.raise_for_status()
|
|
109
|
+
except Exception:
|
|
110
|
+
logging.warning("An exception was thrown!", exc_info=True)
|
|
111
|
+
return []
|
|
112
|
+
|
|
113
|
+
def disamb_queries(self, inputs: list[Document], langs: list[str], minSelectorScore: float, maxTermFrequency: float | None, short_text: bool = False) -> list[list]:
|
|
114
|
+
futures = []
|
|
115
|
+
results: list[list] = [[] for _ in range(len(inputs))]
|
|
116
|
+
start = time.time()
|
|
117
|
+
|
|
118
|
+
for idx, (doc, lang) in enumerate(zip(inputs, langs, strict=True)):
|
|
119
|
+
sents = [(s.start, s.end) for s in doc.sentences] if doc.sentences else [(0, len(doc.text))]
|
|
120
|
+
sentences = [{"offsetStart": s[0], "offsetEnd": s[1]} for s in sents]
|
|
121
|
+
disamb_query = self._build_disamb_query(doc.text, sentences, lang, minSelectorScore, maxTermFrequency, short_text)
|
|
122
|
+
future = self.fdsession.post(self.base_url + self.disamb_url, json=disamb_query, timeout=(30, 300))
|
|
123
|
+
future.idx = idx
|
|
124
|
+
futures.append(future)
|
|
125
|
+
|
|
126
|
+
for future in as_completed(futures):
|
|
127
|
+
try:
|
|
128
|
+
resp = future.result()
|
|
129
|
+
if resp.ok:
|
|
130
|
+
result = resp.json()
|
|
131
|
+
results[future.idx] = result.get("entities", [])
|
|
132
|
+
else:
|
|
133
|
+
resp.raise_for_status()
|
|
134
|
+
except Exception:
|
|
135
|
+
logging.warning("An exception was thrown!", exc_info=True)
|
|
136
|
+
duration = time.time() - start
|
|
137
|
+
logger.debug("EF disamb duration with %d docs %0.3fs", len(inputs), duration)
|
|
138
|
+
return results
|
|
139
|
+
|
|
140
|
+
    @staticmethod
    def group_sentences(text: str, sentences: list[dict], chunk_size: int = DEFAULT_CHUNK_SIZE):
        """Yield (cstart, cend, sstart, send) chunks of roughly chunk_size characters.

        cstart/cend are character offsets into *text*; sstart/send index into
        *sentences*. Chunk edges are snapped to the first/last sentence that
        intersects each chunk_size-wide window, so chunks follow sentence
        boundaries rather than cutting mid-sentence.
        """
        # Index sentences by their character span for range lookups.
        chunks = RangeMap()
        for sent in sentences:
            # Skip empty or degenerate sentence spans.
            if sent["offsetStart"] < sent["offsetEnd"]:
                chunks[sent["offsetStart"] : sent["offsetEnd"]] = sent
        cstart = 0  # current chunk start (character offset)
        cend = 0  # current chunk end (character offset)
        sstart = 0  # index of the first sentence in the current chunk
        while cend < len(text):
            # All sentences intersecting the next chunk_size-wide window.
            ranges = chunks.get_range(cstart, cstart + chunk_size)
            if ranges.start is None or ranges.end is None:
                # No sentence left in the window: done.
                break
            # NOTE(review): assumes len(ranges) counts the sentences in the
            # window — confirm against collections_extended.RangeMap semantics.
            send = sstart + len(ranges)
            first_sent = ranges[ranges.start]
            last_sent = ranges[ranges.end - 1]
            # Snap the chunk to the enclosing sentence boundaries.
            if first_sent is not None and "offsetStart" in first_sent:
                cstart = first_sent["offsetStart"]
            if last_sent is not None and "offsetEnd" in last_sent:
                cend = last_sent["offsetEnd"]
            yield (cstart, cend, sstart, send)
            cstart = cend
            sstart = send
|
|
163
|
+
|
|
164
|
+
@staticmethod
|
|
165
|
+
def chunk_sentences(text: str, sentences: list[dict], chunk_size: int = DEFAULT_CHUNK_SIZE) -> list[tuple]:
|
|
166
|
+
chunks = list(EntityFishingClient.group_sentences(text, sentences, chunk_size))
|
|
167
|
+
# If last chunk is too small aggregate it to the previous
|
|
168
|
+
if len(chunks) > 1 and (chunks[-1][1] - chunks[-1][0]) < chunk_size / 10:
|
|
169
|
+
last_chunk = chunks.pop()
|
|
170
|
+
chunks[-1] = (chunks[-1][0], last_chunk[1], chunks[-1][2], last_chunk[3])
|
|
171
|
+
return chunks
|
|
172
|
+
|
|
173
|
+
def disamb_query_chunks(self, text: str, lang: str, minSelectorScore: float, maxTermFrequency: float | None, sents: tuple, short_text: bool = False) -> list:
|
|
174
|
+
sentences = [{"offsetStart": s[0], "offsetEnd": s[1]} for s in sents]
|
|
175
|
+
if self.chunk_size < 0:
|
|
176
|
+
return self.disamb_query(text, lang, minSelectorScore, maxTermFrequency, sents, short_text)
|
|
177
|
+
|
|
178
|
+
start = time.time()
|
|
179
|
+
text = text.replace("\r\n", " \n")
|
|
180
|
+
entities = []
|
|
181
|
+
chunks = self.chunk_sentences(text, sentences, self.chunk_size)
|
|
182
|
+
futures = []
|
|
183
|
+
cstarts = {}
|
|
184
|
+
for cstart, cend, sstart, send in chunks:
|
|
185
|
+
ctext = text[cstart:cend]
|
|
186
|
+
cid = int(hashlib.md5(ctext.encode("utf-8")).hexdigest(), 16)
|
|
187
|
+
csentences = [{"offsetStart": s["offsetStart"] - cstart, "offsetEnd": s["offsetEnd"] - cstart} for s in sentences[sstart:send]]
|
|
188
|
+
disamb_query = self._build_disamb_query(ctext, csentences, lang, minSelectorScore, maxTermFrequency, short_text)
|
|
189
|
+
cstarts[cid] = cstart
|
|
190
|
+
futures.append(self.fdsession.post(self.base_url + self.disamb_url, json=disamb_query, timeout=(30, 300)))
|
|
191
|
+
|
|
192
|
+
for future in as_completed(futures):
|
|
193
|
+
try:
|
|
194
|
+
resp = future.result()
|
|
195
|
+
if resp.ok:
|
|
196
|
+
result = resp.json()
|
|
197
|
+
rtext = result["shortText"] if short_text else result["text"]
|
|
198
|
+
cid = int(hashlib.md5(rtext.encode("utf-8")).hexdigest(), 16)
|
|
199
|
+
cstart = cstarts[cid]
|
|
200
|
+
ents = result.get("entities", [])
|
|
201
|
+
for ent in ents:
|
|
202
|
+
ent["offsetStart"] += cstart
|
|
203
|
+
ent["offsetEnd"] += cstart
|
|
204
|
+
entities.extend(ents)
|
|
205
|
+
else:
|
|
206
|
+
resp.raise_for_status()
|
|
207
|
+
except Exception:
|
|
208
|
+
logging.warning("An exception was thrown!", exc_info=True)
|
|
209
|
+
duration = time.time() - start
|
|
210
|
+
logger.debug("EF disamb duration with %d chunks %0.3fs", len(chunks), duration)
|
|
211
|
+
return entities
|
|
212
|
+
|
|
213
|
+
def get_kb_concept(self, qid: str) -> dict:
|
|
214
|
+
try:
|
|
215
|
+
resp = self.ksession.get(self.base_url + self.kb_url + qid)
|
|
216
|
+
if resp.ok:
|
|
217
|
+
return resp.json()
|
|
218
|
+
else:
|
|
219
|
+
resp.raise_for_status()
|
|
220
|
+
except Exception:
|
|
221
|
+
logging.warning("An exception was thrown!", exc_info=True)
|
|
222
|
+
return {}
|
|
223
|
+
|
|
224
|
+
def get_kb_concepts(self, qids: Iterable[str]) -> dict:
|
|
225
|
+
qids = list(qids)
|
|
226
|
+
start = time.time()
|
|
227
|
+
futures = [self.fksession.get(self.base_url + self.kb_url + qid) for qid in qids]
|
|
228
|
+
concepts = dict.fromkeys(qids)
|
|
229
|
+
for future in as_completed(futures):
|
|
230
|
+
try:
|
|
231
|
+
resp = future.result()
|
|
232
|
+
if resp.ok:
|
|
233
|
+
concept = resp.json()
|
|
234
|
+
if "wikidataId" in concept:
|
|
235
|
+
concepts[concept["wikidataId"]] = concept
|
|
236
|
+
else:
|
|
237
|
+
resp.raise_for_status()
|
|
238
|
+
except Exception:
|
|
239
|
+
logging.warning("An exception was thrown!", exc_info=True)
|
|
240
|
+
duration = time.time() - start
|
|
241
|
+
logger.debug("EF get kb %d concepts duration %0.3fs", len(qids), duration)
|
|
242
|
+
return concepts
|
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import time
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
from collections.abc import Iterable
|
|
7
|
+
from typing import cast
|
|
8
|
+
|
|
9
|
+
import mongoquery
|
|
10
|
+
from collections_extended import RangeMap
|
|
11
|
+
from pydantic import BaseModel, Field
|
|
12
|
+
from pymultirole_plugins.util import comma_separated_to_list
|
|
13
|
+
from pymultirole_plugins.v1.annotator import AnnotatorBase, AnnotatorParameters
|
|
14
|
+
from pymultirole_plugins.v1.schema import Annotation, Document, Term
|
|
15
|
+
|
|
16
|
+
from pyannotators_entityfishing.ef_client import EntityFishingClient
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger("ef-annotator")
|
|
19
|
+
|
|
20
|
+
SUPPORTED_LANGUAGES = "en,fr,de,es,it,pt,ar,fa,ja,zh,ru,uk,sv,bn,hi,ko,nl,pl"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class EntityFishingParameters(AnnotatorParameters):
    """Pydantic parameter model driving EntityFishingAnnotator.annotate."""

    # --- Connection / service settings ---
    ef_uri: str = Field(
        os.getenv("APP_EF_URI", "https://sherpa-entityfishing.kairntech.com"),
        description="Base URL of the entity-fishing service",
        json_schema_extra={"extra": "advanced"},
    )
    ef_pool_size: int = Field(
        8,
        description="Number of parallel HTTP connections to entity-fishing",
        json_schema_extra={"extra": "advanced"},
    )
    ef_full: bool = Field(
        False,
        description="Request full concept data from entity-fishing (avoids separate KB lookups)",
        json_schema_extra={"extra": "advanced"},
    )
    # --- Disambiguation thresholds ---
    minSelectorScore: float = Field(
        0.3,
        description="Minimum selector threshold to retain a concept",
    )
    maxTermFrequency: float | None = Field(
        None,
        description="""Maximum term frequency above which terms are skipped.<br/>
        Expressed as Zipf (typically 0-8). Lower values speed up processing but may miss entities.""",
    )
    # --- Label mapping ---
    # mapped_labels values are JSON strings parsed into mongoquery expressions
    # by the annotator; concepts matching a query get that label.
    mapped_labels: dict[str, str] | None = Field(
        None,
        description="Mapping from label name to a JSON mongo-query expression matching Wikidata concept properties",
        json_schema_extra={"extra": "key:label,val:json"},
    )
    default_label: str | None = Field(
        None,
        description="Default label for unmapped concepts (if None, unmapped concepts are ignored)",
        json_schema_extra={"extra": "label"},
    )
    output_labels: list[str] | None = Field(
        None,
        description="Subset of labels to keep in the output",
        json_schema_extra={"extra": "label"},
    )
    # --- Filtering / enrichment options ---
    noun_forms_only: bool = Field(
        False,
        description="Filter out terms that do not include at least one noun or proper name (requires spacy)",
        json_schema_extra={"extra": "advanced"},
    )
    # fingerprint / wikidata_properties are comma-separated property-id lists
    # (e.g. "P31,P279") split by comma_separated_to_list in the annotator.
    fingerprint: str | None = Field(
        None,
        description="Comma-separated list of Wikidata properties to consider for the fingerprint",
        json_schema_extra={"extra": "advanced"},
    )
    wikidata_properties: str | None = Field(
        None,
        description="Comma-separated list of Wikidata properties to retrieve",
        json_schema_extra={"extra": "advanced"},
    )
    # --- Processing modes ---
    do_chunking: bool = Field(
        False,
        description="Split input document into chunks and process them in parallel",
        json_schema_extra={"extra": "advanced"},
    )
    short_text: bool = Field(
        False,
        description="Use short-text disambiguation mode (for search queries or titles)",
        json_schema_extra={"extra": "advanced"},
    )
    multivalued_props: bool = Field(
        False,
        description="Use all property values in fingerprint (default keeps only one)",
        json_schema_extra={"extra": "advanced"},
    )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _get_spacy_tagger(lang: str):
|
|
96
|
+
"""Lazily load a spacy model for POS tagging."""
|
|
97
|
+
try:
|
|
98
|
+
import spacy
|
|
99
|
+
except ImportError as e:
|
|
100
|
+
raise ImportError("spacy is required when noun_forms_only=True. Install with: pip install spacy") from e
|
|
101
|
+
|
|
102
|
+
model_map = {
|
|
103
|
+
"en": "en_core_web_sm",
|
|
104
|
+
"fr": "fr_core_news_sm",
|
|
105
|
+
"de": "de_core_news_sm",
|
|
106
|
+
"es": "es_core_news_sm",
|
|
107
|
+
"it": "it_core_news_sm",
|
|
108
|
+
"pt": "pt_core_news_sm",
|
|
109
|
+
"nl": "nl_core_news_sm",
|
|
110
|
+
"zh": "zh_core_web_sm",
|
|
111
|
+
"ja": "ja_core_news_sm",
|
|
112
|
+
"ru": "ru_core_news_sm",
|
|
113
|
+
"pl": "pl_core_news_sm",
|
|
114
|
+
}
|
|
115
|
+
model_name = model_map.get(lang)
|
|
116
|
+
if model_name is None:
|
|
117
|
+
raise ValueError(f"No spacy model configured for language '{lang}'. Disable noun_forms_only or add a model mapping.")
|
|
118
|
+
try:
|
|
119
|
+
return spacy.load(model_name, disable=["parser", "ner", "lemmatizer"])
|
|
120
|
+
except OSError as e:
|
|
121
|
+
raise OSError(f"Spacy model '{model_name}' not found. Install with: python -m spacy download {model_name}") from e
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _document_language(doc: Document, default: str | None = None) -> str | None:
    """Return the language declared in the document metadata, else *default*."""
    if doc.metadata is None:
        return default
    if "language" not in doc.metadata:
        return default
    return doc.metadata["language"]
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _filter_annotations(annotations: Iterable[Annotation], longest_match: bool = True) -> list[Annotation]:
    """Remove overlapping annotations, keeping the longest span at each position.

    With longest_match=False the input is returned unfiltered. Otherwise
    candidates are considered longest-first (ties broken by earliest start)
    and accepted only when neither endpoint falls inside an accepted span;
    the result is returned in document order.
    """
    if not longest_match:
        return list(annotations)

    # Longest first, then earliest start — equivalent to the original
    # (length, -start) key with reverse=True.
    candidates = sorted(annotations, key=lambda a: (-(a.end - a.start), a.start))
    kept = []
    occupied = RangeMap()
    for candidate in candidates:
        overlaps = occupied.get(candidate.start) is not None or occupied.get(candidate.end - 1) is not None
        if not overlaps:
            kept.append(candidate)
            occupied[candidate.start : candidate.end] = candidate
    kept.sort(key=lambda a: a.start)
    return kept
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _enrich_concepts_with_properties(concepts: dict, wiki_props: list[str], multivalued_props: bool = False): # noqa: ARG001
|
|
146
|
+
"""Enrich concept dicts with extracted wikidata_properties."""
|
|
147
|
+
for concept in concepts.values():
|
|
148
|
+
if concept is not None and "statements" in concept:
|
|
149
|
+
fingerprint = defaultdict(list)
|
|
150
|
+
for wp in wiki_props:
|
|
151
|
+
fingerprint[wp] = []
|
|
152
|
+
for st in concept["statements"]:
|
|
153
|
+
if st["propertyId"] in wiki_props:
|
|
154
|
+
fingerprint[st["propertyId"]].append(st["value"])
|
|
155
|
+
if sum(len(v) for v in fingerprint.values()) > 0:
|
|
156
|
+
concept["wikidata_properties"] = dict(fingerprint)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _map_concepts_to_labels(concepts: dict, mapping: dict) -> dict:
    """Assign labels to Wikidata concepts via mongoquery expressions.

    Returns {qid: label}. Labels are applied in *mapping* order, so a concept
    matching several queries keeps the label of the last matching one.
    """
    labelled = {}
    for label, condition in mapping.items():
        matcher = mongoquery.Query(condition)
        for qid, concept in concepts.items():
            if concept is None:
                continue
            if matcher.match(concept):
                labelled[qid] = label
    return labelled
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _build_annotation(entity: dict, label: str, concept: dict | None, lang: str, fingerprints: list[str], wikidata_properties: list[str], multivalued_props: bool) -> Annotation:
    """Build an Annotation from an entity-fishing entity and its resolved concept.

    The returned Annotation carries a single wikidata Term with the entity's
    qid, confidence score, preferred form and selected properties.
    """
    qid = entity["wikidataId"]
    ann_term = Term(
        identifier=qid,
        lexicon="wikidata",
        # NOTE(review): defaults to 1.0 when the service omits confidence_score.
        score=entity.get("confidence_score", 1.0),
        properties={},
    )
    ann_term.properties["wikidataId"] = qid

    if concept and "wikidata_properties" in concept:
        if fingerprints:
            # Build "prop:value" parts, sorted per property; only string values
            # are kept. In single-valued mode only the first value of each
            # property is retained.
            fp_parts = []
            for f in fingerprints:
                fvals = concept["wikidata_properties"].get(f, [])
                if fvals:
                    fingers = [f"{f}:{v}" for v in sorted(fvals) if isinstance(v, str)]
                    if multivalued_props:
                        fp_parts.extend(fingers)
                    elif fingers:
                        fp_parts.append(fingers[0])
            ann_term.properties["fingerprint"] = ",".join(fp_parts)
        if wikidata_properties:
            # Scalar when a property has one value, list when several.
            for wp in wikidata_properties:
                fvals = concept["wikidata_properties"].get(wp, [])
                if len(fvals) == 1:
                    ann_term.properties[wp] = fvals[0]
                elif len(fvals) > 1:
                    ann_term.properties[wp] = fvals

    # Preferred form: use multilingual term for the document language, else preferredTerm, else rawName.
    # Order matters: each assignment below overrides the previous, weakest first.
    if "rawName" in entity:
        ann_term.preferredForm = entity["rawName"]
    if concept:
        if "preferredTerm" in concept:
            ann_term.preferredForm = concept["preferredTerm"]
        if "multilingual" in concept:
            multi_terms = {m["lang"]: m["term"] for m in concept["multilingual"]}
            if lang in multi_terms:
                ann_term.preferredForm = multi_terms[lang]
    if "wikipediaExternalRef" in entity:
        ann_term.properties["wikipediaExternalRef"] = str(entity["wikipediaExternalRef"])

    return Annotation(
        start=entity["offsetStart"],
        end=entity["offsetEnd"],
        labelName=label,
        score=entity.get("confidence_score", 1.0),
        terms=[ann_term],
        properties=None,
    )
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _check_noun_form(spacy_doc, entity: dict) -> bool:
    """Check if the entity span contains at least one noun or proper noun.

    Punctuation tokens are treated as PUNCT and digit tokens as NUM before the
    check; spans that do not align with token boundaries are rejected.
    """
    from spacy.parts_of_speech import NOUN, NUM, PROPN, PUNCT

    span = spacy_doc.char_span(entity["offsetStart"], entity["offsetEnd"])
    if span is None:
        return False
    for token in span:
        pos = PUNCT if token.is_punct else NUM if token.is_digit else token.pos
        if pos in (NOUN, PROPN):
            return True
    return False
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
class EntityFishingAnnotator(AnnotatorBase):
    __doc__ = (
        """[entity-fishing](https://github.com/kermitt2/entity-fishing) annotator for named entity recognition and disambiguation against Wikidata.
#need-segments
#languages:"""
        + SUPPORTED_LANGUAGES
    )

    def annotate(self, documents: list[Document], parameters: AnnotatorParameters) -> list[Document]:
        """Annotate *documents* in place with Wikidata-linked entities.

        Pipeline per call: parse parameters, validate document languages,
        disambiguate (batched, chunked, or single-shot), resolve concepts,
        enrich with Wikidata properties, map concepts to labels, then build
        and de-overlap annotations. Returns the same document list.

        Raises:
            AttributeError: when a document's metadata language is missing or
                not among SUPPORTED_LANGUAGES.
        """
        params: EntityFishingParameters = cast(EntityFishingParameters, parameters)

        supported_languages = comma_separated_to_list(SUPPORTED_LANGUAGES)

        # Parse parameters
        # mapped_labels values are JSON strings -> parsed mongoquery conditions.
        mapping = {k: json.loads(v) for k, v in params.mapped_labels.items()} if params.mapped_labels else {}
        fingerprints = comma_separated_to_list(params.fingerprint)
        wp_list = comma_separated_to_list(params.wikidata_properties)
        wiki_props = fingerprints + wp_list
        # Chunking and short-text mode are mutually exclusive; short_text wins.
        do_chunking = params.do_chunking and not params.short_text
        noun_forms_only = params.noun_forms_only

        # Build the set of valid output classes
        classes = list(mapping.keys())
        if params.default_label:
            classes.append(params.default_label)
        if params.output_labels:
            classes = [c for c in classes if c in params.output_labels]

        if not classes:
            # Nothing to produce: return documents untouched
            return documents

        # Create client
        client = EntityFishingClient(
            base_url=params.ef_uri,
            pool_size=params.ef_pool_size,
            full=params.ef_full,
        )

        # Validate languages
        langs = []
        for doc in documents:
            lang = _document_language(doc)
            if lang is None or lang not in supported_languages:
                raise AttributeError(f"Metadata language '{lang}' is required and must be in {SUPPORTED_LANGUAGES}")
            langs.append(lang)

        # Load spacy taggers if needed
        spacy_docs = None
        if noun_forms_only:
            spacy_docs = []
            for doc, lang in zip(documents, langs, strict=True):
                tagger = _get_spacy_tagger(lang)
                spacy_docs.append(tagger(doc.text))

        # Disambiguate: parallel batch if multiple docs, else single-doc (with optional chunking)
        if len(documents) > 1 and not do_chunking:
            all_results = client.disamb_queries(documents, langs, params.minSelectorScore, params.maxTermFrequency, params.short_text)
        else:
            all_results = []
            for doc, lang in zip(documents, langs, strict=True):
                # Fall back to one whole-document span when no sentences exist.
                sents = tuple((s.start, s.end) for s in doc.sentences) if doc.sentences else ((0, len(doc.text)),)
                if do_chunking:
                    result = client.disamb_query_chunks(doc.text, lang, params.minSelectorScore, params.maxTermFrequency, sents, params.short_text)
                else:
                    result = client.disamb_query(doc.text, lang, params.minSelectorScore, params.maxTermFrequency, sents, params.short_text)
                all_results.append(result)

        # Process results for each document
        for doc_idx, (doc, lang, result) in enumerate(zip(documents, langs, all_results, strict=True)):
            start = time.time()
            # Only entities resolved to a Wikidata qid are kept.
            entities = [e for e in result if "wikidataId" in e]

            # Resolve concepts
            if params.ef_full:
                # Full mode: the disambiguation response already carries concept data.
                concepts = {e["wikidataId"]: e for e in entities}
            else:
                qids = {e["wikidataId"] for e in entities}
                concepts = client.get_kb_concepts(qids) if qids else {}

            # Enrich with wikidata properties
            if wiki_props:
                _enrich_concepts_with_properties(concepts, wiki_props, params.multivalued_props)

            # Map concepts to labels
            mapped_concepts = _map_concepts_to_labels(concepts, mapping)

            # Build annotations
            spacy_doc = spacy_docs[doc_idx] if spacy_docs else None
            anns = []
            for entity in entities:
                qid = entity["wikidataId"]
                label = mapped_concepts.get(qid, params.default_label)
                if not label or label not in classes:
                    continue
                concept = concepts.get(qid)

                # Noun form filtering
                if spacy_doc and noun_forms_only and not _check_noun_form(spacy_doc, entity):
                    continue

                ann = _build_annotation(entity, label, concept, lang, fingerprints, wp_list, params.multivalued_props)
                anns.append(ann)

            # De-overlap annotations (longest span wins) before storing.
            doc.annotations = _filter_annotations(anns)
            duration = time.time() - start
            logger.debug("EF annotator doc %d chars done in %0.3fs", len(doc.text), duration)

        return documents

    @classmethod
    def get_model(cls) -> type[BaseModel]:
        """Return the pydantic parameter model for this annotator."""
        return EntityFishingParameters
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyannotators-entityfishing
|
|
3
|
+
Version: 0.6.3
|
|
4
|
+
Summary: Annotator based on entity-fishing
|
|
5
|
+
Project-URL: Homepage, https://github.com/oterrier/pyannotators_entityfishing/
|
|
6
|
+
Author-email: Olivier Terrier <olivier.terrier@kairntech.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Intended Audience :: Information Technology
|
|
11
|
+
Classifier: Intended Audience :: System Administrators
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Software Development
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
+
Requires-Python: >=3.12
|
|
20
|
+
Requires-Dist: collections-extended
|
|
21
|
+
Requires-Dist: mongoquery
|
|
22
|
+
Requires-Dist: pydantic<3.0,>=2.0
|
|
23
|
+
Requires-Dist: pymultirole-plugins<0.7.0,>=0.6.0
|
|
24
|
+
Requires-Dist: python-singleton-metaclasses
|
|
25
|
+
Requires-Dist: requests
|
|
26
|
+
Requires-Dist: requests-cache
|
|
27
|
+
Requires-Dist: requests-futures
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: bump2version; extra == 'dev'
|
|
30
|
+
Requires-Dist: pre-commit; extra == 'dev'
|
|
31
|
+
Provides-Extra: docs
|
|
32
|
+
Requires-Dist: jupyter-sphinx; extra == 'docs'
|
|
33
|
+
Requires-Dist: lxml-html-clean; extra == 'docs'
|
|
34
|
+
Requires-Dist: m2r2; extra == 'docs'
|
|
35
|
+
Requires-Dist: sphinx; extra == 'docs'
|
|
36
|
+
Requires-Dist: sphinx-rtd-theme; extra == 'docs'
|
|
37
|
+
Requires-Dist: sphinxcontrib-apidoc; extra == 'docs'
|
|
38
|
+
Provides-Extra: spacy
|
|
39
|
+
Requires-Dist: spacy>=3.0; extra == 'spacy'
|
|
40
|
+
Provides-Extra: test
|
|
41
|
+
Requires-Dist: dirty-equals; extra == 'test'
|
|
42
|
+
Requires-Dist: pytest; extra == 'test'
|
|
43
|
+
Requires-Dist: pytest-cov; extra == 'test'
|
|
44
|
+
Requires-Dist: ruff; extra == 'test'
|
|
45
|
+
Description-Content-Type: text/markdown
|
|
46
|
+
|
|
47
|
+
# pyannotators-entityfishing
|
|
48
|
+
|
|
49
|
+
Annotator based on [entity-fishing](https://github.com/kermitt2/entity-fishing) for named entity recognition and disambiguation against Wikidata.
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install pyannotators-entityfishing
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
For noun-form filtering (optional):
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install "pyannotators-entityfishing[spacy]"
|
|
61
|
+
python -m spacy download en_core_web_sm # or other language models
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Usage
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
from pymultirole_plugins.v1.schema import Document
|
|
68
|
+
from pyannotators_entityfishing.entityfishing import EntityFishingAnnotator, EntityFishingParameters
|
|
69
|
+
|
|
70
|
+
annotator = EntityFishingAnnotator()
|
|
71
|
+
parameters = EntityFishingParameters(
|
|
72
|
+
default_label="ENTITY",
|
|
73
|
+
minSelectorScore=0.3,
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
docs = annotator.annotate(
|
|
77
|
+
[Document(text="Albert Einstein was born in Ulm.", metadata={"language": "en"})],
|
|
78
|
+
parameters,
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
for ann in docs[0].annotations:
|
|
82
|
+
print(f"{ann.start}:{ann.end} {ann.labelName} {ann.terms[0].identifier}")
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Development
|
|
86
|
+
|
|
87
|
+
Install test dependencies:
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
uv pip install -e ".[test]"
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Linting
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
uv run ruff check src/ tests/
|
|
97
|
+
uv run ruff format --check src/ tests/
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Testing
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
uv run pytest
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Coverage
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
uv run pytest --cov=src --cov-report=term-missing
|
|
110
|
+
```
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
pyannotators_entityfishing/__init__.py,sha256=y0SVZb5M4ba_bmzuoYxi0ElC6IjH9BvSj_NWoj_Oozo,63
|
|
2
|
+
pyannotators_entityfishing/ef_client.py,sha256=PsefEjekG5znoHlgIOU42BA_QfUeIrAV3VuvR9B_j5g,10780
|
|
3
|
+
pyannotators_entityfishing/entityfishing.py,sha256=r3sDDnQl9tl71-wBOdilHSnIDqyO5zM4c0gncz_uhWo,14139
|
|
4
|
+
pyannotators_entityfishing-0.6.3.dist-info/METADATA,sha256=k3TOmdvP12NAnoyc6979Y6j96fQSP8fux_-RXVNbgUQ,3114
|
|
5
|
+
pyannotators_entityfishing-0.6.3.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
6
|
+
pyannotators_entityfishing-0.6.3.dist-info/entry_points.txt,sha256=cvNE5rkDk8lFjc-nvac1AJpC47ua8yxfXgswkZ5cAh8,103
|
|
7
|
+
pyannotators_entityfishing-0.6.3.dist-info/RECORD,,
|