OntoLearner 1.4.10__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ontolearner/VERSION +1 -1
- ontolearner/base/learner.py +41 -18
- ontolearner/evaluation/metrics.py +72 -32
- ontolearner/learner/__init__.py +3 -2
- ontolearner/learner/label_mapper.py +5 -4
- ontolearner/learner/llm.py +257 -0
- ontolearner/learner/prompt.py +40 -5
- ontolearner/learner/rag/__init__.py +14 -0
- ontolearner/learner/{rag.py → rag/rag.py} +7 -2
- ontolearner/learner/retriever/__init__.py +1 -1
- ontolearner/learner/retriever/{llm_retriever.py → augmented_retriever.py} +48 -39
- ontolearner/learner/retriever/learner.py +3 -4
- ontolearner/learner/taxonomy_discovery/alexbek.py +632 -310
- ontolearner/learner/taxonomy_discovery/skhnlp.py +216 -156
- ontolearner/learner/text2onto/__init__.py +1 -1
- ontolearner/learner/text2onto/alexbek.py +484 -1105
- ontolearner/learner/text2onto/sbunlp.py +498 -493
- ontolearner/ontology/biology.py +2 -3
- ontolearner/ontology/chemistry.py +16 -18
- ontolearner/ontology/ecology_environment.py +2 -3
- ontolearner/ontology/general.py +4 -6
- ontolearner/ontology/material_science_engineering.py +64 -45
- ontolearner/ontology/medicine.py +2 -3
- ontolearner/ontology/scholarly_knowledge.py +6 -9
- ontolearner/processor.py +3 -3
- ontolearner/text2onto/splitter.py +69 -6
- {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/METADATA +2 -2
- {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/RECORD +30 -29
- {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/WHEEL +1 -1
- {ontolearner-1.4.10.dist-info → ontolearner-1.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -4,7 +4,7 @@
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#
+# https://opensource.org/licenses/MIT
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -12,587 +12,592 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import json
-import random
-import re
 import ast
 import gc
-
+import random
+import re
 from collections import defaultdict
+from typing import Any, DefaultDict, Dict, List, Optional, Set
 
 import torch
-from transformers import
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
-from ...base import AutoLearner
+from ...base import AutoLearner
 
-
-# -----------------------------------------------------------------------------
-# Concrete AutoLLM: local HF wrapper that follows the AutoLLM interface
-# -----------------------------------------------------------------------------
-class LocalAutoLLM(AutoLLM):
+class SBUNLPFewShotLearner(AutoLearner):
     """
-
-
+    Public API expected by the pipeline:
+    - `load(model_id=...)`
+    - `fit(train_data, task=..., ontologizer=...)`
+    - `predict(test_data, task=..., ontologizer=...)`
+
+    Expected input bundle format (train/test):
+    - "documents": list of dicts, each with keys: {"id", "title", "text"}
+    - "terms2docs": dict mapping term -> list of doc_ids
+    - "terms2types": optional dict mapping term -> list of types
+
+    Prediction output payload (pipeline wraps this):
+    - {"terms": [{"doc_id": str, "term": str}, ...],
+       "types": [{"doc_id": str, "type": str}, ...]}
     """
 
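As orientation for the rest of this diff, here is a minimal usage sketch of the new public API described in the docstring above. The import path is inferred from this file's location, and the model id and bundle contents are hypothetical; only `load`/`fit`/`predict`, the bundle keys, and the `sample_size`/`seed` kwargs come from the new code itself.

```python
# Sketch only: exercises load()/fit()/predict() as the class docstring describes.
from ontolearner.learner.text2onto.sbunlp import SBUNLPFewShotLearner  # assumed path

train_bundle = {
    "documents": [{"id": "d1", "title": "Alloys", "text": "Steel is an alloy of iron and carbon."}],
    "terms2docs": {"steel": ["d1"], "alloy": ["d1"]},
    "terms2types": {"steel": ["Material"]},  # optional key
}
test_bundle = {"documents": [{"id": "d2", "title": "Polymers", "text": "Nylon is a synthetic polymer."}]}

learner = SBUNLPFewShotLearner(llm_model_id="Qwen/Qwen2.5-0.5B-Instruct", device="cpu")  # hypothetical model id
learner.load()                                    # falls back to llm_model_id from __init__
learner.fit(train_bundle, task="text2onto", sample_size=4, seed=123)
payload = learner.predict(test_bundle, task="text2onto")
# payload == {"terms": [{"doc_id": ..., "term": ...}, ...], "types": [{"doc_id": ..., "type": ...}, ...]}
```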
     def __init__(
-        self, label_mapper: Any = None, device: str = "cpu", token: str = ""
-    ) -> None:
-        super().__init__(label_mapper=label_mapper, device=device, token=token)
-        self.model = None
-        self.tokenizer = None
-
-    def load(
         self,
-
+        llm_model_id: Optional[str] = None,
+        device: str = "cpu",
         load_in_4bit: bool = False,
-
+        max_new_tokens: int = 256,
         trust_remote_code: bool = True,
-    ):
-        """
+    ) -> None:
+        """
+        Initialize the few-shot learner.
+
+        Args:
+            llm_model_id: Default HF model id to load if `load()` is called without one.
+            device: "cpu" or a CUDA device identifier (e.g. "cuda").
+            load_in_4bit: Whether to attempt 4-bit quantized loading (bitsandbytes).
+            max_new_tokens: Maximum tokens to generate per prompt.
+            retriever_model_id: Unused (kept for compatibility).
+            top_k: Unused (kept for compatibility).
+            trust_remote_code: Forwarded to HF loaders (use with caution).
+        """
+        super().__init__()
+        self.device = device
+        self.max_new_tokens = int(max_new_tokens)
 
-
-
+        self._default_model_id = llm_model_id
+        self._load_in_4bit_default = bool(load_in_4bit)
+        self._trust_remote_code_default = bool(trust_remote_code)
 
-        #
-        self.
-
-
-
-
+        # HF objects
+        self.model: Optional[AutoModelForCausalLM] = None
+        self.tokenizer: Optional[AutoTokenizer] = None
+
+        self._is_loaded = False
+        self._loaded_model_id: Optional[str] = None
 
-
+        # Cached few-shot example blocks built during `fit()`
+        self.few_shot_terms_block: str = ""
+        self.few_shot_types_block: str = ""
+
+    def load(self, model_id: Optional[str] = None, **kwargs: Any) -> None:
+        """
+        Load the underlying HF causal LM and tokenizer.
+
+        LearnerPipeline typically calls: `learner.load(model_id=llm_id)`.
+
+        Args:
+            model_id: HF model id. If None, uses `llm_model_id` from __init__.
+            **kwargs:
+                load_in_4bit: override default 4-bit loading.
+                trust_remote_code: override default trust_remote_code.
+        """
+        resolved_model_id = model_id or self._default_model_id
+        if not resolved_model_id:
+            raise ValueError(
+                f"No model_id provided to {self.__class__.__name__}.load() and no llm_model_id in __init__."
+            )
+
+        load_in_4bit = bool(kwargs.get("load_in_4bit", self._load_in_4bit_default))
+        trust_remote_code = bool(kwargs.get("trust_remote_code", self._trust_remote_code_default))
+
+        # Avoid re-loading same model
+        if self._is_loaded and self._loaded_model_id == resolved_model_id:
+            return
+
+        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+        tokenizer = AutoTokenizer.from_pretrained(resolved_model_id, trust_remote_code=trust_remote_code)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        self.tokenizer = tokenizer
+
+        quantization_config = None
         if load_in_4bit:
-
-            quant_config = BitsAndBytesConfig(
+            quantization_config = BitsAndBytesConfig(
                 load_in_4bit=True,
                 bnb_4bit_compute_dtype=torch.float16,
                 bnb_4bit_use_double_quant=True,
                 bnb_4bit_quant_type="nf4",
             )
-
-            torch_dtype_val = torch.float16
+            torch_dtype = torch.float16
 
-        # Set device mapping (auto for multi-GPU or single GPU, explicit CPU otherwise)
         device_map = "auto" if (self.device != "cpu") else {"": "cpu"}
 
-
-
-            model_id,
+        model = AutoModelForCausalLM.from_pretrained(
+            resolved_model_id,
             device_map=device_map,
-            torch_dtype=
-            quantization_config=
+            torch_dtype=torch_dtype,
+            quantization_config=quantization_config,
            trust_remote_code=trust_remote_code,
         )
 
-        # Ensure model is on the correct device (redundant if device_map="auto" but safe)
         if self.device == "cpu":
-
+            model.to("cpu")
 
-
-        self
-
-
-
-
-
-        """Generate continuations for a list of prompts, returning only the generated part."""
-        if self.model is None or self.tokenizer is None:
-            raise RuntimeError("Model/tokenizer not loaded. Call .load() first.")
+        self.model = model
+        self._is_loaded = True
+        self._loaded_model_id = resolved_model_id
+
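For context, the 4-bit branch above is the standard bitsandbytes-backed quantization path in transformers. A standalone sketch of the same configuration follows; the model id is a hypothetical choice, and running it requires a CUDA device with the bitsandbytes package installed:

```python
# Minimal sketch of the 4-bit loading path taken when load_in_4bit=True.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "mistralai/Mistral-7B-Instruct-v0.2"  # hypothetical model id
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # same fallback as load() above

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # compute in fp16 on top of nf4 weights
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
    bnb_4bit_quant_type="nf4",
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
)
```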
+    def _invert_terms_to_docs_mapping(self, terms_to_documents: Dict[str, List[str]]) -> Dict[str, List[str]]:
+        """
+        Convert term->docs mapping to doc->terms mapping.
 
-
-
-        enc = self.tokenizer(inputs, return_tensors="pt", padding=True, truncation=True)
-        input_ids = enc["input_ids"]
-        attention_mask = enc["attention_mask"]
+        Args:
+            terms_to_documents: Mapping from term to list of document IDs.
 
-
-
-
-
+        Returns:
+            Mapping from document ID to list of terms associated with it.
+        """
+        document_to_terms: DefaultDict[str, List[str]] = defaultdict(list)
+        for term, document_ids in (terms_to_documents or {}).items():
+            for document_id in document_ids or []:
+                document_to_terms[str(document_id)].append(str(term))
+        return dict(document_to_terms)
 
-
-
-
-
-
-
-
-            temperature > 0.0
-        ), # Use greedy decoding if temperature is 0.0
-            temperature=temperature,
-            top_p=top_p,
-            pad_token_id=self.tokenizer.eos_token_id,
-        )
+    def _derive_document_to_types(
+        self,
+        terms_to_documents: Dict[str, List[str]],
+        terms_to_types: Optional[Dict[str, List[str]]],
+    ) -> Dict[str, List[str]]:
+        """
+        Derive doc->types mapping using (terms->docs) and (terms->types).
 
-
-
-
-        full_decoded_text = self.tokenizer.decode(
-            output_ids, skip_special_tokens=True
-        )
-        prompt_text = self.tokenizer.decode(input_ids[i], skip_special_tokens=True)
+        Args:
+            terms_to_documents: term -> [doc_id...]
+            terms_to_types: term -> [type...]
 
-
-
-
-
-
-        prompt_len = input_ids.shape[1]
-        generated_tail = self.tokenizer.decode(
-            output_ids[prompt_len:], skip_special_tokens=True
-        ).strip()
-        decoded_outputs.append(generated_tail)
+        Returns:
+            doc_id -> sorted list of unique types.
+        """
+        if not terms_to_types:
+            return {}
 
-
+        document_to_types: DefaultDict[str, Set[str]] = defaultdict(set)
 
+        for term, document_ids in (terms_to_documents or {}).items():
+            candidate_types = terms_to_types.get(term, []) or []
+            for document_id in document_ids or []:
+                for candidate_type in candidate_types:
+                    if isinstance(candidate_type, str) and candidate_type.strip():
+                        document_to_types[str(document_id)].add(candidate_type.strip())
 
-
-# Main Learner: SBUNLPFewShotLearner (Task A Text2Onto)
-# -----------------------------------------------------------------------------
-class SBUNLPFewShotLearner(AutoLearner):
-    """
-    Concrete learner implementing the Task A Text2Onto pipeline (Term and Type Extraction).
-    It uses Few-Shot prompts generated from training data for inference.
-    """
+        return {doc_id: sorted(list(type_set)) for doc_id, type_set in document_to_types.items()}
 
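The two new helpers are pure dict transforms, so their behavior can be pinned down on toy data. A sketch (import path assumed; values illustrative, no model load required for these calls):

```python
from ontolearner.learner.text2onto.sbunlp import SBUNLPFewShotLearner  # assumed path

learner = SBUNLPFewShotLearner()  # defaults are enough for the mapping helpers

learner._invert_terms_to_docs_mapping({"steel": ["d1", "d2"], "alloy": ["d1"]})
# -> {"d1": ["steel", "alloy"], "d2": ["steel"]}

learner._derive_document_to_types(
    {"steel": ["d1", "d2"], "alloy": ["d1"]},
    {"steel": ["Material"], "alloy": ["Material", "Concept"]},
)
# -> {"d1": ["Concept", "Material"], "d2": ["Material"]}  # types deduplicated, sorted per doc
```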
-    def
-
-
-
-
-
-
-
+    def _truncate_text(self, text: str, max_chars: int) -> str:
+        """
+        Truncate text to a maximum number of characters (adds an ellipsis when truncated).
+
+        Args:
+            text: Input text.
+            max_chars: Maximum characters to keep. If <= 0, returns the original text.
+
+        Returns:
+            Truncated or original text.
+        """
+        if not max_chars or max_chars <= 0 or not text:
+            return text or ""
+        return (text[:max_chars] + "…") if len(text) > max_chars else text
 
-
-    def build_stratified_fewshot_prompt(
+    def build_few_shot_terms_block(
         self,
-
-
+        documents: List[Dict[str, Any]],
+        terms_to_documents: Dict[str, List[str]],
         sample_size: int = 28,
         seed: int = 123,
         max_chars_per_text: int = 1200,
     ) -> str:
         """
-
+        Build and cache the few-shot block for term extraction.
+
+        Strategy:
+        - Create strata by associated terms (doc -> associated term list).
+        - Sample proportionally across strata.
+        - Deduplicate by document id and top up from remaining docs if needed.
+
+        Args:
+            documents: Documents with keys: {"id","title","text"}.
+            terms_to_documents: Mapping term -> list of doc IDs.
+            sample_size: Desired number of examples in the block.
+            seed: RNG seed (local to this call).
+            max_chars_per_text: Text truncation limit per example.
+
+        Returns:
+            The formatted few-shot example block string.
         """
-        random.
-
-
-
-
-
-
-
-
-
-        num_sample_docs = min(sample_size, num_total_docs)
-
-        # Load the map of term -> [list of document IDs]
-        with open(terms_path, "r", encoding="utf-8") as file_handle:
-            term_to_doc_map = json.load(file_handle)
-
-        # Invert map: document ID -> [list of terms]
-        doc_id_to_terms_map = defaultdict(list)
-        for term, doc_ids in term_to_doc_map.items():
-            for doc_id in doc_ids:
-                doc_id_to_terms_map[doc_id].append(term)
-
-        # Define strata (groups of documents associated with specific terms)
-        strata_map = defaultdict(list)
-        for doc in corpus_documents:
-            doc_id = doc.get("id", "")
-            associated_terms = doc_id_to_terms_map.get(doc_id, ["no_term"])
+        rng = random.Random(seed)
+
+        document_to_terms = self._invert_terms_to_docs_mapping(terms_to_documents)
+        total_documents = len(documents)
+        target_sample_count = min(int(sample_size), total_documents)
+
+        strata: DefaultDict[str, List[Dict[str, Any]]] = defaultdict(list)
+        for document in documents:
+            document_id = str(document.get("id", ""))
+            associated_terms = document_to_terms.get(document_id, ["no_term"])
             for term in associated_terms:
-
+                strata[str(term)].append(document)
 
-        # Perform proportional sampling across strata
         sampled_documents: List[Dict[str, Any]] = []
-        for
-
-            if num_stratum_docs == 0:
+        for docs_in_stratum in strata.values():
+            if not docs_in_stratum:
                 continue
-
-
-
-
-
-
-
-
-
-
+            proportion = len(docs_in_stratum) / max(1, total_documents)
+            stratum_quota = int(target_sample_count * proportion)
+            if stratum_quota > 0:
+                sampled_documents.extend(rng.sample(docs_in_stratum, min(stratum_quota, len(docs_in_stratum))))
+
+        sampled_by_id = {str(d.get("id", "")): d for d in sampled_documents if d.get("id", "")}
+        final_documents = list(sampled_by_id.values())
+
+        if len(final_documents) > target_sample_count:
+            final_documents = rng.sample(final_documents, target_sample_count)
+        elif len(final_documents) < target_sample_count:
+            remaining_documents = [d for d in documents if str(d.get("id", "")) not in sampled_by_id]
+            additional_needed = min(target_sample_count - len(final_documents), len(remaining_documents))
+            if additional_needed > 0:
+                final_documents.extend(rng.sample(remaining_documents, additional_needed))
+
+        lines: List[str] = []
+        for document in final_documents:
+            document_id = str(document.get("id", ""))
+            title = str(document.get("title", ""))
+            text = self._truncate_text(str(document.get("text", "")), max_chars_per_text)
+            associated_terms = document_to_terms.get(document_id, [])
+
+            lines.append(
+                "Document ID: {doc_id}\n"
+                "Title: {title}\n"
+                "Text: {text}\n"
+                "Associated Terms: {terms}\n"
+                "----------------------------------------".format(
+                    doc_id=document_id,
+                    title=title,
+                    text=text,
+                    terms=associated_terms,
                 )
-
-        # Deduplicate sampled documents by ID and adjust count to exactly 'sample_size'
-        unique_docs_by_id = {}
-        for doc in sampled_documents:
-            unique_docs_by_id[doc.get("id", "")] = doc
-
-        final_sample_docs = list(unique_docs_by_id.values())
-
-        if len(final_sample_docs) > num_sample_docs:
-            final_sample_docs = random.sample(final_sample_docs, num_sample_docs)
-        elif len(final_sample_docs) < num_sample_docs:
-            remaining_docs = [
-                d for d in corpus_documents if d.get("id", "") not in unique_docs_by_id
-            ]
-            needed_count = min(
-                num_sample_docs - len(final_sample_docs), len(remaining_docs)
-            )
-            final_sample_docs.extend(random.sample(remaining_docs, needed_count))
-
-        # Format the few-shot exemplar text block
-        prompt_lines: List[str] = []
-        for doc in final_sample_docs:
-            doc_id = doc.get("id", "")
-            title = doc.get("title", "")
-            text = doc.get("text", "")
-
-            # Truncate text if it exceeds the maximum character limit
-            if max_chars_per_text and len(text) > max_chars_per_text:
-                text = text[:max_chars_per_text] + "…"
-
-            associated_terms = doc_id_to_terms_map.get(doc_id, [])
-            prompt_lines.append(
-                f"Document ID: {doc_id}\nTitle: {title}\nText: {text}\nAssociated Terms: {associated_terms}\n----------------------------------------"
             )
 
-
-        self.
-        return prompt_block
+        self.few_shot_terms_block = "\n".join(lines)
+        return self.few_shot_terms_block
 
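The sampling step above gives each stratum a quota proportional to its share of the corpus; because `int()` truncates, tiny strata can receive zero and the dedup-and-top-up pass then refills the sample. A worked instance of the arithmetic (numbers illustrative):

```python
# Quota arithmetic from build_few_shot_terms_block, on made-up sizes.
total_documents = 100
target_sample_count = min(28, total_documents)         # 28
stratum_size = 40                                      # documents sharing one term
proportion = stratum_size / max(1, total_documents)    # 0.4
stratum_quota = int(target_sample_count * proportion)  # int(11.2) == 11
assert stratum_quota == 11

small_quota = int(target_sample_count * (3 / 100))     # int(0.84) == 0
assert small_quota == 0  # contributes nothing up front; the top-up pass compensates
```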
-
-    def build_types_fewshot_block(
+    def build_few_shot_types_block(
         self,
-
-
-
-
-
+        documents: List[Dict[str, Any]],
+        terms_to_documents: Dict[str, List[str]],
+        terms_to_types: Optional[Dict[str, List[str]]] = None,
+        sample_size: int = 28,
+        seed: int = 123,
         max_chars_per_text: int = 800,
     ) -> str:
         """
-
-
+        Build and cache the few-shot block for type (class) extraction.
+
+        Prefers doc->types derived from `terms_to_types`; if absent, falls back to treating
+        associated terms as "types" for stratification (behavior-preserving fallback).
+
+        Args:
+            documents: Documents with keys: {"id","title","text"}.
+            terms_to_documents: Mapping term -> list of doc IDs.
+            terms_to_types: Optional mapping term -> list of types.
+            sample_size: Desired number of examples in the block.
+            seed: RNG seed (local to this call).
+            max_chars_per_text: Text truncation limit per example.
+
+        Returns:
+            The formatted few-shot example block string.
         """
-
-        docs_by_id = {}
-        with open(docs_jsonl, "r", encoding="utf-8") as file_handle:
-            for line in file_handle:
-                line_stripped = line.strip()
-                if line_stripped:
-                    try:
-                        doc = json.loads(line_stripped)
-                        doc_id = doc.get("id", "")
-                        if doc_id:
-                            docs_by_id[doc_id] = doc
-                    except json.JSONDecodeError:
-                        continue
-
-        # Load term -> [doc_id,...] map
-        with open(terms2doc_json, "r", encoding="utf-8") as file_handle:
-            term_to_doc_map = json.load(file_handle)
-
-        flags = 0 if case_sensitive else re.IGNORECASE
-        prompt_lines: List[str] = []
-
-        # Iterate over terms (which act as types in this context)
-        for term, doc_ids in term_to_doc_map.items():
-            escaped_term = re.escape(term)
-            # Create regex pattern for matching the term in the text
-            pattern = rf"\b{escaped_term}\b" if full_word else escaped_term
-            term_regex = re.compile(pattern, flags=flags)
-
-            picked_count = 0
-            for doc_id in doc_ids:
-                doc = docs_by_id.get(doc_id)
-                if not doc:
-                    continue
-
-                title = doc.get("title", "")
-                text = doc.get("text", "")
-
-                # Check if the term/type is actually present in the document text/title
-                if term_regex.search(f"{title} {text}"):
-                    text_content = text
-
-                    # Truncate text if necessary
-                    if max_chars_per_text and len(text_content) > max_chars_per_text:
-                        text_content = text_content[:max_chars_per_text] + "…"
-
-                    # Escape single quotes in the term for Python list formatting in the prompt
-                    term_for_prompt = term.replace("'", "\\'")
-
-                    prompt_lines.append(
-                        f"Document ID: {doc_id}\nTitle: {title}\nText: {text_content}\nAssociated Types: ['{term_for_prompt}']\n----------------------------------------"
-                    )
-                    picked_count += 1
-
-                    if picked_count >= sample_per_term:
-                        break  # Move to the next term
-
-        prompt_block = "\n".join(prompt_lines)
-        self.fewshot_types_block = prompt_block
-        return prompt_block
+        rng = random.Random(seed)
 
-
-
-
-
-
-
-
+        documents_by_id = {str(d.get("id", "")): d for d in documents if d.get("id", "")}
+
+        document_to_types = self._derive_document_to_types(terms_to_documents, terms_to_types)
+        if not document_to_types:
+            document_to_types = self._invert_terms_to_docs_mapping(terms_to_documents)
+
+        type_to_documents: DefaultDict[str, List[Dict[str, Any]]] = defaultdict(list)
+        for document_id, candidate_types in document_to_types.items():
+            document = documents_by_id.get(document_id)
+            if not document:
+                continue
+            for candidate_type in candidate_types:
+                type_to_documents[str(candidate_type)].append(document)
+
+        total_documents = len(documents)
+        target_sample_count = min(int(sample_size), total_documents)
+
+        sampled_documents: List[Dict[str, Any]] = []
+        for docs_in_stratum in type_to_documents.values():
+            if not docs_in_stratum:
+                continue
+            proportion = len(docs_in_stratum) / max(1, total_documents)
+            stratum_quota = int(target_sample_count * proportion)
+            if stratum_quota > 0:
+                sampled_documents.extend(rng.sample(docs_in_stratum, min(stratum_quota, len(docs_in_stratum))))
+
+        sampled_by_id = {str(d.get("id", "")): d for d in sampled_documents if d.get("id", "")}
+        final_documents = list(sampled_by_id.values())
+
+        if len(final_documents) > target_sample_count:
+            final_documents = rng.sample(final_documents, target_sample_count)
+        elif len(final_documents) < target_sample_count:
+            remaining_documents = [d for d in documents if str(d.get("id", "")) not in sampled_by_id]
+            additional_needed = min(target_sample_count - len(final_documents), len(remaining_documents))
+            if additional_needed > 0:
+                final_documents.extend(rng.sample(remaining_documents, additional_needed))
+
+        lines: List[str] = []
+        for document in final_documents:
+            document_id = str(document.get("id", ""))
+            title = str(document.get("title", ""))
+            text = self._truncate_text(str(document.get("text", "")), max_chars_per_text)
+
+            associated_types = document_to_types.get(document_id, [])
+            associated_types_escaped = [t.replace("'", "\\'") for t in associated_types]
+
+            lines.append(
+                "Document ID: {doc_id}\n"
+                "Title: {title}\n"
+                "Text: {text}\n"
+                "Associated Types: {types}\n"
+                "----------------------------------------".format(
+                    doc_id=document_id,
+                    title=title,
+                    text=text,
+                    types=associated_types_escaped,
+                )
+            )
+
+        self.few_shot_types_block = "\n".join(lines)
+        return self.few_shot_types_block
+
+    def _format_term_prompt(self, example_block: str, title: str, text: str) -> str:
         """
-
-
+        Format a prompt for term extraction.
+
+        Args:
+            example_block: Few-shot examples block.
+            title: Document title.
+            text: Document text.
+
+        Returns:
+            Prompt string.
         """
-
-
-
+        return (
+            f"{example_block}\n"
+            "[var]\n"
+            f"Title: {title}\n"
+            f"Text: {text}\n"
+            "[var]\n"
+            "Extract all relevant terms that could form the basis of an ontology from the above document.\n"
+            "Return ONLY a Python list like ['term1', 'term2', ...] and nothing else.\n"
+            "If no terms are found, return [].\n"
         )
-
-
-
+
+    def _format_type_prompt(self, example_block: str, title: str, text: str) -> str:
+        """
+        Format a prompt for type (class) extraction.
+
+        Args:
+            example_block: Few-shot examples block.
+            title: Document title.
+            text: Document text.
+
+        Returns:
+            Prompt string.
+        """
+        return (
+            f"{example_block}\n"
+            "[var]\n"
+            f"Title: {title}\n"
+            f"Text: {text}\n"
+            "[var]\n"
+            "Extract all relevant TYPES mentioned in the above document that could serve as ontology classes.\n"
+            "Only consider content inside the [var] ... [var] block.\n"
+            "Return ONLY a valid Python list like ['type1', 'type2'] and nothing else. If none, return [].\n"
        )
 
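To make the `[var]` sentinel format concrete, here is what `_format_term_prompt` renders for one document. A sketch: the import path is assumed, and the example block is shortened to a single stub entry:

```python
from ontolearner.learner.text2onto.sbunlp import SBUNLPFewShotLearner  # assumed path

learner = SBUNLPFewShotLearner()
prompt = learner._format_term_prompt(
    example_block="Document ID: d1\nTitle: Alloys\nText: Steel is an alloy.\nAssociated Terms: ['steel']\n----------------------------------------",
    title="Polymers",
    text="Nylon is a synthetic polymer.",
)
print(prompt)
# Document ID: d1
# Title: Alloys
# Text: Steel is an alloy.
# Associated Terms: ['steel']
# ----------------------------------------
# [var]
# Title: Polymers
# Text: Nylon is a synthetic polymer.
# [var]
# Extract all relevant terms that could form the basis of an ontology from the above document.
# Return ONLY a Python list like ['term1', 'term2', ...] and nothing else.
# If no terms are found, return [].
```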
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return f"""{example_block}
-[var]
-Title: {title}
-Text: {text}
-[var]
-Extract all relevant TYPES mentioned in the above document that could serve as ontology classes.
-Only consider content inside the [var] ... [var] block.
-Return ONLY a valid Python list like ['type1', 'type2'] and nothing else. If none, return [].
-"""
-
-    def _parse_list_like(self, raw_string: str) -> List[str]:
-        """Try to extract a Python list of strings from model output robustly."""
-        processed_string = raw_string.strip()
-        if processed_string in ("[]", ""):
+    def _parse_python_list_of_strings(self, raw_text: str) -> List[str]:
+        """
+        Parse an LLM response intended to be a Python list of strings.
+
+        This parser is intentionally tolerant:
+        1) Try literal_eval on the full string
+        2) Else extract the first [...] block and literal_eval it
+        3) Else fallback to extracting quoted strings
+
+        Args:
+            raw_text: Model output.
+
+        Returns:
+            List of strings (possibly empty).
+        """
+        stripped = (raw_text or "").strip()
+        if stripped in ("", "[]"):
             return []
 
-        # 1. Try direct evaluation
         try:
-
-            if isinstance(
-
-            return [item for item in parsed_value if isinstance(item, str)]
+            parsed = ast.literal_eval(stripped)
+            if isinstance(parsed, list):
+                return [item for item in parsed if isinstance(item, str)]
         except Exception:
             pass
 
-
-
-        if bracket_match:
+        match = re.search(r"\[[\s\S]*?\]", stripped)
+        if match:
             try:
-
-                if isinstance(
-                return [item for item in
+                parsed = ast.literal_eval(match.group(0))
+                if isinstance(parsed, list):
+                    return [item for item in parsed if isinstance(item, str)]
            except Exception:
                pass
 
-
-
-        quoted_matches = re.findall(r"'([^']+)'|\"([^\"]+)\"", processed_string)
-        flattened_list = [a_match or b_match for a_match, b_match in quoted_matches]
-        return flattened_list
-
-    def _call_model_one(self, prompt: str, max_new_tokens: int = 120) -> str:
-        """Calls the underlying LocalAutoLLM for a single prompt. Returns the raw tail output."""
-        # self.model is an instance of LocalAutoLLM
-        model_output = self.model.generate(
-            [prompt], max_new_tokens=max_new_tokens, temperature=0.0, top_p=1.0
-        )
-        return model_output[0] if model_output else ""
+        quoted = re.findall(r"'([^']+)'|\"([^\"]+)\"", stripped)
+        return [a or b for a, b in quoted]
 
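Each fallback stage of the tolerant parser rescues a progressively messier reply. A sketch of all three stages on illustrative outputs (import path assumed):

```python
from ontolearner.learner.text2onto.sbunlp import SBUNLPFewShotLearner  # assumed path

parser = SBUNLPFewShotLearner()._parse_python_list_of_strings

parser("['steel', 'alloy']")                                    # stage 1: literal_eval on the full reply
# -> ['steel', 'alloy']
parser("Sure! Here you go: ['steel', 'alloy'] Hope it helps.")  # stage 2: first [...] block
# -> ['steel', 'alloy']
parser("terms: 'steel', \"alloy\"")                             # stage 3: quoted-string fallback
# -> ['steel', 'alloy']
parser("no list here")
# -> []
```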
-    def
-        self,
-        docs_test_jsonl: str,
-        out_jsonl: str,
-        max_lines: int = -1,
-        max_new_tokens: int = 120,
-    ) -> int:
+    def _generate_completion(self, prompt_text: str) -> str:
         """
-
-
+        Generate a completion for a single prompt (deterministic decoding).
+
+        Args:
+            prompt_text: Full prompt to send to the model.
+
+        Returns:
+            The generated completion text (prompt stripped where possible).
         """
-        if
-            raise RuntimeError("
-
-
-
-
-
-        )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        )
-                    num_written_terms += 1
-
-            # Lightweight memory management for long runs
-            if line_index % 50 == 0:
-                gc.collect()
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
-
-        return num_written_terms
-
-    def predict_types(
+        if self.model is None or self.tokenizer is None:
+            raise RuntimeError("Model/tokenizer not loaded. Call .load() first.")
+
+        encoded = self.tokenizer([prompt_text], return_tensors="pt", padding=True, truncation=True)
+        input_ids = encoded["input_ids"]
+        attention_mask = encoded["attention_mask"]
+
+        model_device = next(self.model.parameters()).device
+        input_ids = input_ids.to(model_device)
+        attention_mask = attention_mask.to(model_device)
+
+        with torch.no_grad():
+            output_ids = self.model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                max_new_tokens=self.max_new_tokens,
+                do_sample=False,
+                temperature=0.0,
+                top_p=1.0,
+                pad_token_id=self.tokenizer.eos_token_id,
+            )[0]
+
+        decoded_full = self.tokenizer.decode(output_ids, skip_special_tokens=True)
+        decoded_prompt = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
+
+        if decoded_full.startswith(decoded_prompt):
+            return decoded_full[len(decoded_prompt) :].strip()
+
+        prompt_token_count = int(attention_mask[0].sum().item())
+        return self.tokenizer.decode(output_ids[prompt_token_count:], skip_special_tokens=True).strip()
+
+    def fit(
         self,
-
-
-
-
-    ) ->
-        """
-        Runs Type Extraction on the test documents and saves results to a JSONL file.
-        Returns: The count of individual types written.
-        """
-        if not self.fewshot_types_block:
-            raise RuntimeError("Few-shot block for types is empty. Call fit() first.")
-
-        num_written_types = 0
-        with (
-            open(docs_test_jsonl, "r", encoding="utf-8") as file_in,
-            open(out_jsonl, "w", encoding="utf-8") as file_out,
-        ):
-            for line_index, line in enumerate(file_in, start=1):
-                if 0 < max_lines < line_index:
-                    break
-
-                try:
-                    document = json.loads(line.strip())
-                except Exception:
-                    continue  # Skip malformed JSON lines
-
-                doc_id = document.get("id", "unknown")
-                title = document.get("title", "")
-                text = document.get("text", "")
-
-                # Construct and call model using the dedicated type prompt block
-                prompt = self._build_type_prompt(self.fewshot_types_block, title, text)
-                raw_output = self._call_model_one(prompt, max_new_tokens=max_new_tokens)
-                predicted_types = self._parse_list_like(raw_output)
-
-                # Write extracted types
-                for term_or_type in predicted_types:
-                    if isinstance(term_or_type, str) and term_or_type.strip():
-                        file_out.write(
-                            json.dumps({"doc_id": doc_id, "type": term_or_type.strip()})
-                            + "\n"
-                        )
-                        num_written_types += 1
-
-                if line_index % 50 == 0:
-                    gc.collect()
-                    if torch.cuda.is_available():
-                        torch.cuda.empty_cache()
-
-        return num_written_types
-
-    # --- Evaluation utilities (unchanged from prior definition, added docstrings) ---
-    def load_gold_pairs(self, terms2doc_path: str) -> Set[Tuple[str, str]]:
-        """Convert terms2docs JSON into a set of unique (doc_id, term) pairs, lowercased."""
-        gold_pairs = set()
-        with open(terms2doc_path, "r", encoding="utf-8") as file_handle:
-            term_to_doc_map = json.load(file_handle)
-
-        for term, doc_ids in term_to_doc_map.items():
-            clean_term = term.strip().lower()
-            for doc_id in doc_ids:
-                gold_pairs.add((doc_id, clean_term))
-        return gold_pairs
-
-    def load_predicted_pairs(
-        self, predicted_jsonl_path: str, key: str = "term"
-    ) -> Set[Tuple[str, str]]:
-        """Load predicted (doc_id, term/type) pairs from a JSONL file, lowercased."""
-        predicted_pairs = set()
-        with open(predicted_jsonl_path, "r", encoding="utf-8") as file_handle:
-            for line in file_handle:
-                try:
-                    entry = json.loads(line.strip())
-                except Exception:
-                    continue
-                doc_id = entry.get("doc_id")
-                value = entry.get(key)
-                if doc_id and value:
-                    predicted_pairs.add((doc_id, value.strip().lower()))
-        return predicted_pairs
-
-    def evaluate_extraction_f1(
-        self, terms2doc_path: str, predicted_jsonl: str, key: str = "term"
-    ) -> float:
+        train_data: Any,
+        task: str = "text2onto",
+        ontologizer: bool = False,
+        **kwargs: Any,
+    ) -> None:
         """
-
+        Build and cache few-shot blocks from the training split.
+
+        Args:
+            train_data: A split bundle dict. Must contain "documents" and "terms2docs".
+            task: Must be "text2onto".
+            ontologizer: Unused here (kept for signature compatibility).
+            **kwargs:
+                sample_size: Few-shot sample size per block (default 28).
+                seed: RNG seed (default 123).
         """
-
-
-        predicted_set = self.load_predicted_pairs(predicted_jsonl, key=key)
+        if task != "text2onto":
+            raise ValueError(f"{self.__class__.__name__} only supports task='text2onto' (got {task!r}).")
 
-
-
+        if not self._is_loaded:
+            self.load(model_id=self._default_model_id)
 
-
-
-
+        documents: List[Dict[str, Any]] = train_data.get("documents", []) or []
+        terms_to_documents: Dict[str, List[str]] = train_data.get("terms2docs", {}) or {}
+        terms_to_types: Optional[Dict[str, List[str]]] = train_data.get("terms2types", None)
 
-
-
+        sample_size = int(kwargs.get("sample_size", 28))
+        seed = int(kwargs.get("seed", 123))
 
-
-
+        self.build_few_shot_terms_block(
+            documents=documents,
+            terms_to_documents=terms_to_documents,
+            sample_size=sample_size,
+            seed=seed,
+        )
+        self.build_few_shot_types_block(
+            documents=documents,
+            terms_to_documents=terms_to_documents,
+            terms_to_types=terms_to_types,
+            sample_size=sample_size,
+            seed=seed,
        )
 
-
-
+    def predict(
+        self,
+        test_data: Any,
+        task: str = "text2onto",
+        ontologizer: bool = False,
+        **kwargs: Any,
+    ) -> Dict[str, Any]:
+        """
+        Run term/type extraction over test documents.
 
-
-
-
-
-
-
-        print(f" 🎯 True Positives: {num_true_positives}")
+        Args:
+            test_data: A split bundle dict. Must contain "documents".
+            task: Must be "text2onto".
+            ontologizer: Unused here (kept for signature compatibility).
+            **kwargs:
+                max_docs: If > 0, limit number of docs processed.
 
-
+        Returns:
+            Prediction payload dict: {"terms": [...], "types": [...]}.
+        """
+        if task != "text2onto":
+            raise ValueError(f"{self.__class__.__name__} only supports task='text2onto' (got {task!r}).")
+
+        if not self.few_shot_terms_block or not self.few_shot_types_block:
+            raise RuntimeError("Few-shot blocks are empty. Pipeline should call fit() before predict().")
+
+        max_docs = int(kwargs.get("max_docs", -1))
+        documents: List[Dict[str, Any]] = test_data.get("documents", []) or []
+        if max_docs > 0:
+            documents = documents[:max_docs]
+
+        term_predictions: List[Dict[str, str]] = []
+        type_predictions: List[Dict[str, str]] = []
+
+        for doc_index, document in enumerate(documents, start=1):
+            document_id = str(document.get("id", "unknown"))
+            title = str(document.get("title", ""))
+            text = str(document.get("text", ""))
+
+            term_prompt = self._format_term_prompt(self.few_shot_terms_block, title, text)
+            extracted_terms = self._parse_python_list_of_strings(self._generate_completion(term_prompt))
+            for term in extracted_terms:
+                normalized_term = (term or "").strip()
+                if normalized_term:
+                    term_predictions.append({"doc_id": document_id, "term": normalized_term})
+
+            type_prompt = self._format_type_prompt(self.few_shot_types_block, title, text)
+            extracted_types = self._parse_python_list_of_strings(self._generate_completion(type_prompt))
+            for extracted_type in extracted_types:
+                normalized_type = (extracted_type or "").strip()
+                if normalized_type:
+                    type_predictions.append({"doc_id": document_id, "type": normalized_type})
+
+            if doc_index % 50 == 0:
+                gc.collect()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
+        # IMPORTANT: return only the prediction payload; LearnerPipeline wraps it.
+        return {"terms": term_predictions, "types": type_predictions}
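For reference, the in-memory payload `predict()` now returns has this shape (values hypothetical); LearnerPipeline is responsible for wrapping it, replacing the old JSONL-file outputs of `predict_terms`/`predict_types`:

```python
# Shape of the prediction payload returned by predict() (illustrative values).
payload = {
    "terms": [
        {"doc_id": "d2", "term": "nylon"},
        {"doc_id": "d2", "term": "synthetic polymer"},
    ],
    "types": [
        {"doc_id": "d2", "type": "polymer"},
    ],
}
```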