pyannotators-entityfishing 0.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ """Annotator based on entity-fishing"""
2
+
3
+ __version__ = "0.6.3"
@@ -0,0 +1,242 @@
1
+ import hashlib
2
+ import logging
3
+ import time
4
+ from collections.abc import Iterable
5
+ from concurrent.futures import as_completed
6
+ from datetime import timedelta
7
+
8
+ import requests
9
+ from collections_extended import RangeMap
10
+ from pymultirole_plugins.v1.schema import Document
11
+ from requests.adapters import DEFAULT_POOLSIZE, HTTPAdapter
12
+ from requests_cache import CachedSession
13
+ from requests_futures.sessions import FuturesSession
14
+
15
+ logger = logging.getLogger("ef-client")
16
+
17
+ DEFAULT_CHUNK_SIZE = 1000
18
+
19
+
20
class EntityFishingClient:
    """HTTP client for an entity-fishing service.

    Offers Wikidata disambiguation of texts (synchronous single request,
    parallel multi-document, or chunked single-document) plus knowledge-base
    concept lookups. Responses can be cached in shared in-memory sqlite
    databases, and parallel requests go through a thread-pooled
    ``FuturesSession``.
    """

    def __init__(
        self,
        base_url: str = "https://sherpa-entityfishing.kairntech.com",
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        cache_disamb: int = -1,
        cache_concept: int = -1,
        pool_size: int = 8,
        full: bool = False,
    ):
        """Initialize HTTP sessions and endpoint paths.

        Args:
            base_url: Base URL of the entity-fishing service.
            chunk_size: Target chunk size in characters for chunked
                disambiguation; a negative value disables chunking.
            cache_disamb: Expiry (seconds) of the disambiguation POST cache;
                values <= 0 disable caching.
            cache_concept: Expiry (seconds) of the KB concept GET cache;
                values <= 0 disable caching.
            pool_size: HTTP connection pool size and futures worker count.
            full: Ask the service to return full concept data inline.
        """
        self.base_url = base_url.rstrip("/")
        self.chunk_size = chunk_size
        self.full = full

        # Disambiguation session: caches POST responses in a shared in-memory
        # sqlite database when caching is enabled, plain session otherwise.
        self.dsession = (
            CachedSession(
                "file:disamb_cache?mode=memory&cache=shared",
                backend="sqlite",
                uri=True,
                wal=True,
                cache_control=True,
                expire_after=timedelta(seconds=cache_disamb),
                allowable_methods=["POST"],
            )
            if cache_disamb > 0
            else requests.Session()
        )
        self.dsession.mount("http://", HTTPAdapter(pool_maxsize=max(DEFAULT_POOLSIZE, pool_size)))
        self.dsession.mount("https://", HTTPAdapter(pool_maxsize=max(DEFAULT_POOLSIZE, pool_size)))
        self.dsession.headers.update({"Content-Type": "application/json", "Accept": "application/json"})
        self.dsession.verify = False
        self.fdsession = FuturesSession(session=self.dsession, max_workers=pool_size)

        # Concept (KB) session: same pattern, but caches GET responses.
        self.ksession = (
            CachedSession(
                "file:concept_cache?mode=memory&cache=shared",
                backend="sqlite",
                uri=True,
                wal=True,
                cache_control=True,
                expire_after=timedelta(seconds=cache_concept),
                allowable_methods=["GET"],
            )
            if cache_concept > 0
            else requests.Session()
        )
        self.ksession.mount("http://", HTTPAdapter(pool_maxsize=max(DEFAULT_POOLSIZE, pool_size)))
        self.ksession.mount("https://", HTTPAdapter(pool_maxsize=max(DEFAULT_POOLSIZE, pool_size)))
        self.ksession.headers.update({"Content-Type": "application/json", "Accept": "application/json"})
        self.ksession.verify = False
        self.fksession = FuturesSession(session=self.ksession, max_workers=pool_size)

        self.disamb_url = "/service/disambiguate/"
        self.kb_url = "/service/kb/concept/"
        self.term_url = "/service/kb/term/"

    def _build_disamb_query(self, text: str, sentences: list[dict], lang: str, minSelectorScore: float, maxTermFrequency: float | None, short_text: bool = False) -> dict:
        """Build the JSON payload of a disambiguation request."""
        # Offset-preserving normalization of Windows line endings (2 chars -> 2 chars).
        text = text.replace("\r\n", " \n")
        disamb_query = {
            "sentences": sentences,
            "language": {"lang": lang},
            "mentions": ["wikipedia"],
            "nbest": short_text,
            "sentence": False,
            "full": self.full,
            "customisation": "generic",
            "minSelectorScore": minSelectorScore,
        }
        if short_text:
            disamb_query["shortText"] = text
        else:
            disamb_query["text"] = text
        if maxTermFrequency is not None and isinstance(maxTermFrequency, float):
            disamb_query["maxTermFrequency"] = maxTermFrequency
        return disamb_query

    def disamb_query(self, text: str, lang: str, minSelectorScore: float, maxTermFrequency: float | None, sents: tuple, short_text: bool = False) -> list:
        """Disambiguate one text synchronously.

        Returns the list of entities, or [] on any HTTP or parsing error
        (errors are logged, never raised to the caller).
        """
        sentences = [{"offsetStart": s[0], "offsetEnd": s[1]} for s in sents]
        disamb_query = self._build_disamb_query(text, sentences, lang, minSelectorScore, maxTermFrequency, short_text)
        try:
            start = time.time()
            resp = self.dsession.post(self.base_url + self.disamb_url, json=disamb_query, timeout=(30, 300))
            duration = time.time() - start
            logger.debug("EF disamb duration %0.3fs", duration)
            if resp.ok:
                result = resp.json()
                return result.get("entities", [])
            else:
                resp.raise_for_status()
        except Exception:
            # Fixed: use the module logger ("ef-client"), not the root logger,
            # so warnings follow this client's logging configuration.
            logger.warning("An exception was thrown!", exc_info=True)
        # Fall-through (error path) always yields an empty result.
        return []

    def disamb_queries(self, inputs: list[Document], langs: list[str], minSelectorScore: float, maxTermFrequency: float | None, short_text: bool = False) -> list[list]:
        """Disambiguate several documents in parallel.

        Returns one entity list per input document, aligned with *inputs*;
        a failed request leaves [] at the corresponding position.
        """
        futures = []
        results: list[list] = [[] for _ in range(len(inputs))]
        start = time.time()

        for idx, (doc, lang) in enumerate(zip(inputs, langs, strict=True)):
            sents = [(s.start, s.end) for s in doc.sentences] if doc.sentences else [(0, len(doc.text))]
            sentences = [{"offsetStart": s[0], "offsetEnd": s[1]} for s in sents]
            disamb_query = self._build_disamb_query(doc.text, sentences, lang, minSelectorScore, maxTermFrequency, short_text)
            future = self.fdsession.post(self.base_url + self.disamb_url, json=disamb_query, timeout=(30, 300))
            # Tag the future with the index of its input document so results
            # can be placed correctly whatever the completion order.
            future.idx = idx
            futures.append(future)

        for future in as_completed(futures):
            try:
                resp = future.result()
                if resp.ok:
                    result = resp.json()
                    results[future.idx] = result.get("entities", [])
                else:
                    resp.raise_for_status()
            except Exception:
                logger.warning("An exception was thrown!", exc_info=True)
        duration = time.time() - start
        logger.debug("EF disamb duration with %d docs %0.3fs", len(inputs), duration)
        return results

    @staticmethod
    def group_sentences(text: str, sentences: list[dict], chunk_size: int = DEFAULT_CHUNK_SIZE):
        """Yield sentence-aligned chunks as (char_start, char_end, sent_start, sent_end).

        Sentences are indexed in a RangeMap keyed by character offsets;
        each chunk snaps to the boundaries of the sentences it covers.
        """
        chunks = RangeMap()
        for sent in sentences:
            # Skip empty/degenerate sentence spans.
            if sent["offsetStart"] < sent["offsetEnd"]:
                chunks[sent["offsetStart"] : sent["offsetEnd"]] = sent
        cstart = 0
        cend = 0
        sstart = 0
        while cend < len(text):
            ranges = chunks.get_range(cstart, cstart + chunk_size)
            if ranges.start is None or ranges.end is None:
                break
            send = sstart + len(ranges)
            first_sent = ranges[ranges.start]
            last_sent = ranges[ranges.end - 1]
            # Snap the chunk to the enclosing sentence boundaries.
            if first_sent is not None and "offsetStart" in first_sent:
                cstart = first_sent["offsetStart"]
            if last_sent is not None and "offsetEnd" in last_sent:
                cend = last_sent["offsetEnd"]
            yield (cstart, cend, sstart, send)
            cstart = cend
            sstart = send

    @staticmethod
    def chunk_sentences(text: str, sentences: list[dict], chunk_size: int = DEFAULT_CHUNK_SIZE) -> list[tuple]:
        """Return the chunk list from :meth:`group_sentences`, merging a tiny tail.

        A trailing chunk shorter than chunk_size/10 is folded into the
        previous one to avoid sending a near-empty request.
        """
        chunks = list(EntityFishingClient.group_sentences(text, sentences, chunk_size))
        # If last chunk is too small aggregate it to the previous
        if len(chunks) > 1 and (chunks[-1][1] - chunks[-1][0]) < chunk_size / 10:
            last_chunk = chunks.pop()
            chunks[-1] = (chunks[-1][0], last_chunk[1], chunks[-1][2], last_chunk[3])
        return chunks

    def disamb_query_chunks(self, text: str, lang: str, minSelectorScore: float, maxTermFrequency: float | None, sents: tuple, short_text: bool = False) -> list:
        """Disambiguate a long text by chunking it and querying chunks in parallel.

        Responses are matched back to their chunk via an MD5 of the chunk
        text — this assumes the service echoes the text field unchanged
        (a mismatch silently drops that chunk's entities; TODO confirm the
        service contract). Entity offsets are shifted back to full-text
        coordinates. Falls back to a single query when chunking is disabled.
        """
        sentences = [{"offsetStart": s[0], "offsetEnd": s[1]} for s in sents]
        if self.chunk_size < 0:
            return self.disamb_query(text, lang, minSelectorScore, maxTermFrequency, sents, short_text)

        start = time.time()
        # Apply the same offset-preserving normalization as _build_disamb_query
        # *before* chunking so hashes and offsets match the text actually sent.
        text = text.replace("\r\n", " \n")
        entities = []
        chunks = self.chunk_sentences(text, sentences, self.chunk_size)
        futures = []
        cstarts = {}
        for cstart, cend, sstart, send in chunks:
            ctext = text[cstart:cend]
            cid = int(hashlib.md5(ctext.encode("utf-8")).hexdigest(), 16)
            # Re-base sentence offsets to the chunk's local coordinates.
            csentences = [{"offsetStart": s["offsetStart"] - cstart, "offsetEnd": s["offsetEnd"] - cstart} for s in sentences[sstart:send]]
            disamb_query = self._build_disamb_query(ctext, csentences, lang, minSelectorScore, maxTermFrequency, short_text)
            cstarts[cid] = cstart
            futures.append(self.fdsession.post(self.base_url + self.disamb_url, json=disamb_query, timeout=(30, 300)))

        for future in as_completed(futures):
            try:
                resp = future.result()
                if resp.ok:
                    result = resp.json()
                    rtext = result["shortText"] if short_text else result["text"]
                    cid = int(hashlib.md5(rtext.encode("utf-8")).hexdigest(), 16)
                    cstart = cstarts[cid]
                    ents = result.get("entities", [])
                    for ent in ents:
                        ent["offsetStart"] += cstart
                        ent["offsetEnd"] += cstart
                    entities.extend(ents)
                else:
                    resp.raise_for_status()
            except Exception:
                logger.warning("An exception was thrown!", exc_info=True)
        duration = time.time() - start
        logger.debug("EF disamb duration with %d chunks %0.3fs", len(chunks), duration)
        return entities

    def get_kb_concept(self, qid: str) -> dict:
        """Fetch one Wikidata concept by QID; return {} on error."""
        try:
            resp = self.ksession.get(self.base_url + self.kb_url + qid)
            if resp.ok:
                return resp.json()
            else:
                resp.raise_for_status()
        except Exception:
            logger.warning("An exception was thrown!", exc_info=True)
        return {}

    def get_kb_concepts(self, qids: Iterable[str]) -> dict:
        """Fetch several Wikidata concepts in parallel.

        Returns a dict keyed by QID; entries whose request failed (or whose
        response lacked a wikidataId) remain None.
        """
        qids = list(qids)
        start = time.time()
        futures = [self.fksession.get(self.base_url + self.kb_url + qid) for qid in qids]
        concepts = dict.fromkeys(qids)
        for future in as_completed(futures):
            try:
                resp = future.result()
                if resp.ok:
                    concept = resp.json()
                    if "wikidataId" in concept:
                        concepts[concept["wikidataId"]] = concept
                else:
                    resp.raise_for_status()
            except Exception:
                logger.warning("An exception was thrown!", exc_info=True)
        duration = time.time() - start
        logger.debug("EF get kb %d concepts duration %0.3fs", len(qids), duration)
        return concepts
@@ -0,0 +1,347 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import time
5
+ from collections import defaultdict
6
+ from collections.abc import Iterable
7
+ from typing import cast
8
+
9
+ import mongoquery
10
+ from collections_extended import RangeMap
11
+ from pydantic import BaseModel, Field
12
+ from pymultirole_plugins.util import comma_separated_to_list
13
+ from pymultirole_plugins.v1.annotator import AnnotatorBase, AnnotatorParameters
14
+ from pymultirole_plugins.v1.schema import Annotation, Document, Term
15
+
16
+ from pyannotators_entityfishing.ef_client import EntityFishingClient
17
+
18
+ logger = logging.getLogger("ef-annotator")
19
+
20
+ SUPPORTED_LANGUAGES = "en,fr,de,es,it,pt,ar,fa,ja,zh,ru,uk,sv,bn,hi,ko,nl,pl"
21
+
22
+
23
class EntityFishingParameters(AnnotatorParameters):
    """Pydantic parameter model for :class:`EntityFishingAnnotator`.

    Field descriptions are surfaced in the UI; ``json_schema_extra``
    carries platform hints ("advanced", "label", ...).
    """

    # --- connection settings -------------------------------------------------
    ef_uri: str = Field(
        os.getenv("APP_EF_URI", "https://sherpa-entityfishing.kairntech.com"),
        description="Base URL of the entity-fishing service",
        json_schema_extra={"extra": "advanced"},
    )
    ef_pool_size: int = Field(
        8,
        description="Number of parallel HTTP connections to entity-fishing",
        json_schema_extra={"extra": "advanced"},
    )
    ef_full: bool = Field(
        False,
        description="Request full concept data from entity-fishing (avoids separate KB lookups)",
        json_schema_extra={"extra": "advanced"},
    )
    # --- disambiguation thresholds ------------------------------------------
    minSelectorScore: float = Field(
        0.3,
        description="Minimum selector threshold to retain a concept",
    )
    maxTermFrequency: float | None = Field(
        None,
        description="""Maximum term frequency above which terms are skipped.<br/>
        Expressed as Zipf (typically 0-8). Lower values speed up processing but may miss entities.""",
    )
    # --- label mapping -------------------------------------------------------
    mapped_labels: dict[str, str] | None = Field(
        None,
        description="Mapping from label name to a JSON mongo-query expression matching Wikidata concept properties",
        json_schema_extra={"extra": "key:label,val:json"},
    )
    default_label: str | None = Field(
        None,
        description="Default label for unmapped concepts (if None, unmapped concepts are ignored)",
        json_schema_extra={"extra": "label"},
    )
    output_labels: list[str] | None = Field(
        None,
        description="Subset of labels to keep in the output",
        json_schema_extra={"extra": "label"},
    )
    # --- filtering and enrichment -------------------------------------------
    noun_forms_only: bool = Field(
        False,
        description="Filter out terms that do not include at least one noun or proper name (requires spacy)",
        json_schema_extra={"extra": "advanced"},
    )
    fingerprint: str | None = Field(
        None,
        description="Comma-separated list of Wikidata properties to consider for the fingerprint",
        json_schema_extra={"extra": "advanced"},
    )
    wikidata_properties: str | None = Field(
        None,
        description="Comma-separated list of Wikidata properties to retrieve",
        json_schema_extra={"extra": "advanced"},
    )
    # --- processing modes ----------------------------------------------------
    do_chunking: bool = Field(
        False,
        description="Split input document into chunks and process them in parallel",
        json_schema_extra={"extra": "advanced"},
    )
    short_text: bool = Field(
        False,
        description="Use short-text disambiguation mode (for search queries or titles)",
        json_schema_extra={"extra": "advanced"},
    )
    multivalued_props: bool = Field(
        False,
        description="Use all property values in fingerprint (default keeps only one)",
        json_schema_extra={"extra": "advanced"},
    )
93
+
94
+
95
+ def _get_spacy_tagger(lang: str):
96
+ """Lazily load a spacy model for POS tagging."""
97
+ try:
98
+ import spacy
99
+ except ImportError as e:
100
+ raise ImportError("spacy is required when noun_forms_only=True. Install with: pip install spacy") from e
101
+
102
+ model_map = {
103
+ "en": "en_core_web_sm",
104
+ "fr": "fr_core_news_sm",
105
+ "de": "de_core_news_sm",
106
+ "es": "es_core_news_sm",
107
+ "it": "it_core_news_sm",
108
+ "pt": "pt_core_news_sm",
109
+ "nl": "nl_core_news_sm",
110
+ "zh": "zh_core_web_sm",
111
+ "ja": "ja_core_news_sm",
112
+ "ru": "ru_core_news_sm",
113
+ "pl": "pl_core_news_sm",
114
+ }
115
+ model_name = model_map.get(lang)
116
+ if model_name is None:
117
+ raise ValueError(f"No spacy model configured for language '{lang}'. Disable noun_forms_only or add a model mapping.")
118
+ try:
119
+ return spacy.load(model_name, disable=["parser", "ner", "lemmatizer"])
120
+ except OSError as e:
121
+ raise OSError(f"Spacy model '{model_name}' not found. Install with: python -m spacy download {model_name}") from e
122
+
123
+
124
def _document_language(doc: Document, default: str | None = None) -> str | None:
    """Return the language stored in the document metadata, else *default*."""
    metadata = doc.metadata or {}
    return metadata.get("language", default)
128
+
129
+
130
def _filter_annotations(annotations: Iterable[Annotation], longest_match: bool = True) -> list[Annotation]:
    """Remove overlapping annotations; the longest span wins.

    With longest_match=False the input is returned unchanged (as a list).
    """
    if not longest_match:
        return list(annotations)

    def span_priority(ann):
        # Longer spans first; ties broken by earlier start.
        return (ann.end - ann.start, -ann.start)

    occupied = RangeMap()
    kept = []
    for ann in sorted(annotations, key=span_priority, reverse=True):
        # Keep the annotation only when neither endpoint falls in a claimed range.
        if occupied.get(ann.start) is None and occupied.get(ann.end - 1) is None:
            kept.append(ann)
            occupied[ann.start : ann.end] = ann
    kept.sort(key=lambda ann: ann.start)
    return kept
143
+
144
+
145
+ def _enrich_concepts_with_properties(concepts: dict, wiki_props: list[str], multivalued_props: bool = False): # noqa: ARG001
146
+ """Enrich concept dicts with extracted wikidata_properties."""
147
+ for concept in concepts.values():
148
+ if concept is not None and "statements" in concept:
149
+ fingerprint = defaultdict(list)
150
+ for wp in wiki_props:
151
+ fingerprint[wp] = []
152
+ for st in concept["statements"]:
153
+ if st["propertyId"] in wiki_props:
154
+ fingerprint[st["propertyId"]].append(st["value"])
155
+ if sum(len(v) for v in fingerprint.values()) > 0:
156
+ concept["wikidata_properties"] = dict(fingerprint)
157
+
158
+
159
def _map_concepts_to_labels(concepts: dict, mapping: dict) -> dict:
    """Assign a label to every concept matching a mongoquery condition.

    When several conditions match one concept, the last mapping entry wins.
    """
    labels_by_qid: dict = {}
    for label, condition in mapping.items():
        matcher = mongoquery.Query(condition)
        matched = {qid: label for qid, concept in concepts.items() if concept is not None and matcher.match(concept)}
        labels_by_qid.update(matched)
    return labels_by_qid
168
+
169
+
170
def _build_annotation(entity: dict, label: str, concept: dict | None, lang: str, fingerprints: list[str], wikidata_properties: list[str], multivalued_props: bool) -> Annotation:
    """Build an Annotation (carrying one wikidata Term) from an entity-fishing entity."""
    qid = entity["wikidataId"]
    score = entity.get("confidence_score", 1.0)
    term = Term(
        identifier=qid,
        lexicon="wikidata",
        score=score,
        properties={"wikidataId": qid},
    )

    enriched = concept.get("wikidata_properties") if concept else None
    if enriched is not None:
        if fingerprints:
            # Fingerprint: "prop:value" pairs; one value per property unless
            # multivalued_props is set.
            parts: list[str] = []
            for fp in fingerprints:
                values = enriched.get(fp, [])
                if not values:
                    continue
                formatted = [f"{fp}:{v}" for v in sorted(values) if isinstance(v, str)]
                if multivalued_props:
                    parts.extend(formatted)
                elif formatted:
                    parts.append(formatted[0])
            term.properties["fingerprint"] = ",".join(parts)
        for wp in wikidata_properties:
            values = enriched.get(wp, [])
            # Scalar when single-valued, list otherwise; absent when empty.
            if len(values) == 1:
                term.properties[wp] = values[0]
            elif len(values) > 1:
                term.properties[wp] = values

    # Preferred form priority: raw mention < concept preferredTerm < translation
    # matching the document language.
    if "rawName" in entity:
        term.preferredForm = entity["rawName"]
    if concept:
        if "preferredTerm" in concept:
            term.preferredForm = concept["preferredTerm"]
        if "multilingual" in concept:
            translations = {m["lang"]: m["term"] for m in concept["multilingual"]}
            if lang in translations:
                term.preferredForm = translations[lang]
    if "wikipediaExternalRef" in entity:
        term.properties["wikipediaExternalRef"] = str(entity["wikipediaExternalRef"])

    return Annotation(
        start=entity["offsetStart"],
        end=entity["offsetEnd"],
        labelName=label,
        score=score,
        terms=[term],
        properties=None,
    )
222
+
223
+
224
def _check_noun_form(spacy_doc, entity: dict) -> bool:
    """Return True when the entity span contains at least one noun or proper noun."""
    from spacy.parts_of_speech import NOUN, NUM, PROPN, PUNCT

    span = spacy_doc.char_span(entity["offsetStart"], entity["offsetEnd"])
    if span is None:
        # Entity boundaries do not align with token boundaries.
        return False
    for token in span:
        # Punctuation and digits are remapped so they never count as nouns.
        effective_pos = PUNCT if token.is_punct else NUM if token.is_digit else token.pos
        if effective_pos in (NOUN, PROPN):
            return True
    return False
233
+
234
+
235
class EntityFishingAnnotator(AnnotatorBase):
    # __doc__ is built dynamically so the platform markers (#need-segments,
    # #languages:) always carry the current SUPPORTED_LANGUAGES list.
    __doc__ = (
        """[entity-fishing](https://github.com/kermitt2/entity-fishing) annotator for named entity recognition and disambiguation against Wikidata.
#need-segments
#languages:"""
        + SUPPORTED_LANGUAGES
    )

    def annotate(self, documents: list[Document], parameters: AnnotatorParameters) -> list[Document]:
        """Annotate *documents* in place with Wikidata entities and return them.

        Pipeline per document: disambiguate text via entity-fishing, resolve
        concepts (inline when ef_full, else via KB lookups), enrich with
        requested Wikidata properties, map concepts to labels, optionally
        filter non-noun mentions, then keep the longest non-overlapping spans.

        Raises AttributeError when a document's metadata language is missing
        or unsupported.
        """
        params: EntityFishingParameters = cast(EntityFishingParameters, parameters)

        supported_languages = comma_separated_to_list(SUPPORTED_LANGUAGES)

        # Parse parameters
        mapping = {k: json.loads(v) for k, v in params.mapped_labels.items()} if params.mapped_labels else {}
        fingerprints = comma_separated_to_list(params.fingerprint)
        wp_list = comma_separated_to_list(params.wikidata_properties)
        wiki_props = fingerprints + wp_list
        # Chunking and short-text mode are mutually exclusive; short_text wins.
        do_chunking = params.do_chunking and not params.short_text
        noun_forms_only = params.noun_forms_only

        # Build the set of valid output classes
        classes = list(mapping.keys())
        if params.default_label:
            classes.append(params.default_label)
        if params.output_labels:
            classes = [c for c in classes if c in params.output_labels]

        if not classes:
            # Nothing to produce: return documents untouched
            return documents

        # Create client
        client = EntityFishingClient(
            base_url=params.ef_uri,
            pool_size=params.ef_pool_size,
            full=params.ef_full,
        )

        # Validate languages up front so we fail before any HTTP traffic.
        langs = []
        for doc in documents:
            lang = _document_language(doc)
            if lang is None or lang not in supported_languages:
                raise AttributeError(f"Metadata language '{lang}' is required and must be in {SUPPORTED_LANGUAGES}")
            langs.append(lang)

        # Load spacy taggers if needed (raises if spacy/model is missing)
        spacy_docs = None
        if noun_forms_only:
            spacy_docs = []
            for doc, lang in zip(documents, langs, strict=True):
                tagger = _get_spacy_tagger(lang)
                spacy_docs.append(tagger(doc.text))

        # Disambiguate: parallel batch if multiple docs, else single-doc (with optional chunking)
        if len(documents) > 1 and not do_chunking:
            all_results = client.disamb_queries(documents, langs, params.minSelectorScore, params.maxTermFrequency, params.short_text)
        else:
            all_results = []
            for doc, lang in zip(documents, langs, strict=True):
                # Fall back to one whole-document "sentence" when none are provided.
                sents = tuple((s.start, s.end) for s in doc.sentences) if doc.sentences else ((0, len(doc.text)),)
                if do_chunking:
                    result = client.disamb_query_chunks(doc.text, lang, params.minSelectorScore, params.maxTermFrequency, sents, params.short_text)
                else:
                    result = client.disamb_query(doc.text, lang, params.minSelectorScore, params.maxTermFrequency, sents, params.short_text)
                all_results.append(result)

        # Process results for each document
        for doc_idx, (doc, lang, result) in enumerate(zip(documents, langs, all_results, strict=True)):
            start = time.time()
            # Only entities resolved to a Wikidata QID are usable downstream.
            entities = [e for e in result if "wikidataId" in e]

            # Resolve concepts: inline entity data when ef_full, else KB lookups.
            if params.ef_full:
                concepts = {e["wikidataId"]: e for e in entities}
            else:
                qids = {e["wikidataId"] for e in entities}
                concepts = client.get_kb_concepts(qids) if qids else {}

            # Enrich with wikidata properties
            if wiki_props:
                _enrich_concepts_with_properties(concepts, wiki_props, params.multivalued_props)

            # Map concepts to labels
            mapped_concepts = _map_concepts_to_labels(concepts, mapping)

            # Build annotations
            spacy_doc = spacy_docs[doc_idx] if spacy_docs else None
            anns = []
            for entity in entities:
                qid = entity["wikidataId"]
                label = mapped_concepts.get(qid, params.default_label)
                if not label or label not in classes:
                    continue
                concept = concepts.get(qid)

                # Noun form filtering
                if spacy_doc and noun_forms_only and not _check_noun_form(spacy_doc, entity):
                    continue

                ann = _build_annotation(entity, label, concept, lang, fingerprints, wp_list, params.multivalued_props)
                anns.append(ann)

            # Longest non-overlapping spans win.
            doc.annotations = _filter_annotations(anns)
            duration = time.time() - start
            logger.debug("EF annotator doc %d chars done in %0.3fs", len(doc.text), duration)

        return documents

    @classmethod
    def get_model(cls) -> type[BaseModel]:
        """Return the pydantic parameter model used by this annotator."""
        return EntityFishingParameters
@@ -0,0 +1,110 @@
1
+ Metadata-Version: 2.4
2
+ Name: pyannotators-entityfishing
3
+ Version: 0.6.3
4
+ Summary: Annotator based on entity-fishing
5
+ Project-URL: Homepage, https://github.com/oterrier/pyannotators_entityfishing/
6
+ Author-email: Olivier Terrier <olivier.terrier@kairntech.com>
7
+ License: MIT
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Intended Audience :: Information Technology
11
+ Classifier: Intended Audience :: System Administrators
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Software Development
16
+ Classifier: Topic :: Software Development :: Libraries
17
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
18
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
+ Requires-Python: >=3.12
20
+ Requires-Dist: collections-extended
21
+ Requires-Dist: mongoquery
22
+ Requires-Dist: pydantic<3.0,>=2.0
23
+ Requires-Dist: pymultirole-plugins<0.7.0,>=0.6.0
24
+ Requires-Dist: python-singleton-metaclasses
25
+ Requires-Dist: requests
26
+ Requires-Dist: requests-cache
27
+ Requires-Dist: requests-futures
28
+ Provides-Extra: dev
29
+ Requires-Dist: bump2version; extra == 'dev'
30
+ Requires-Dist: pre-commit; extra == 'dev'
31
+ Provides-Extra: docs
32
+ Requires-Dist: jupyter-sphinx; extra == 'docs'
33
+ Requires-Dist: lxml-html-clean; extra == 'docs'
34
+ Requires-Dist: m2r2; extra == 'docs'
35
+ Requires-Dist: sphinx; extra == 'docs'
36
+ Requires-Dist: sphinx-rtd-theme; extra == 'docs'
37
+ Requires-Dist: sphinxcontrib-apidoc; extra == 'docs'
38
+ Provides-Extra: spacy
39
+ Requires-Dist: spacy>=3.0; extra == 'spacy'
40
+ Provides-Extra: test
41
+ Requires-Dist: dirty-equals; extra == 'test'
42
+ Requires-Dist: pytest; extra == 'test'
43
+ Requires-Dist: pytest-cov; extra == 'test'
44
+ Requires-Dist: ruff; extra == 'test'
45
+ Description-Content-Type: text/markdown
46
+
47
+ # pyannotators-entityfishing
48
+
49
+ Annotator based on [entity-fishing](https://github.com/kermitt2/entity-fishing) for named entity recognition and disambiguation against Wikidata.
50
+
51
+ ## Installation
52
+
53
+ ```bash
54
+ pip install pyannotators-entityfishing
55
+ ```
56
+
57
+ For noun-form filtering (optional):
58
+
59
+ ```bash
60
+ pip install "pyannotators-entityfishing[spacy]"
61
+ python -m spacy download en_core_web_sm # or other language models
62
+ ```
63
+
64
+ ## Usage
65
+
66
+ ```python
67
+ from pymultirole_plugins.v1.schema import Document
68
+ from pyannotators_entityfishing.entityfishing import EntityFishingAnnotator, EntityFishingParameters
69
+
70
+ annotator = EntityFishingAnnotator()
71
+ parameters = EntityFishingParameters(
72
+ default_label="ENTITY",
73
+ minSelectorScore=0.3,
74
+ )
75
+
76
+ docs = annotator.annotate(
77
+ [Document(text="Albert Einstein was born in Ulm.", metadata={"language": "en"})],
78
+ parameters,
79
+ )
80
+
81
+ for ann in docs[0].annotations:
82
+ print(f"{ann.start}:{ann.end} {ann.labelName} {ann.terms[0].identifier}")
83
+ ```
84
+
85
+ ## Development
86
+
87
+ Install test dependencies:
88
+
89
+ ```bash
90
+ uv pip install -e ".[test]"
91
+ ```
92
+
93
+ ### Linting
94
+
95
+ ```bash
96
+ uv run ruff check src/ tests/
97
+ uv run ruff format --check src/ tests/
98
+ ```
99
+
100
+ ### Testing
101
+
102
+ ```bash
103
+ uv run pytest
104
+ ```
105
+
106
+ ### Coverage
107
+
108
+ ```bash
109
+ uv run pytest --cov=src --cov-report=term-missing
110
+ ```
@@ -0,0 +1,7 @@
1
+ pyannotators_entityfishing/__init__.py,sha256=y0SVZb5M4ba_bmzuoYxi0ElC6IjH9BvSj_NWoj_Oozo,63
2
+ pyannotators_entityfishing/ef_client.py,sha256=PsefEjekG5znoHlgIOU42BA_QfUeIrAV3VuvR9B_j5g,10780
3
+ pyannotators_entityfishing/entityfishing.py,sha256=r3sDDnQl9tl71-wBOdilHSnIDqyO5zM4c0gncz_uhWo,14139
4
+ pyannotators_entityfishing-0.6.3.dist-info/METADATA,sha256=k3TOmdvP12NAnoyc6979Y6j96fQSP8fux_-RXVNbgUQ,3114
5
+ pyannotators_entityfishing-0.6.3.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
6
+ pyannotators_entityfishing-0.6.3.dist-info/entry_points.txt,sha256=cvNE5rkDk8lFjc-nvac1AJpC47ua8yxfXgswkZ5cAh8,103
7
+ pyannotators_entityfishing-0.6.3.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [pyannotators.plugins]
2
+ entityfishing = pyannotators_entityfishing.entityfishing:EntityFishingAnnotator