PyPI - ebm4subjects - Versions diffs - 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

ebm4subjects 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

ebm4subjects/analyzer.py +3 -2
ebm4subjects/chunker.py +25 -20
ebm4subjects/duckdb_client.py +8 -8
ebm4subjects/ebm_logging.py +11 -8
ebm4subjects/ebm_model.py +105 -75
ebm4subjects/embedding_generator.py +12 -5
{ebm4subjects-0.4.1.dist-info → ebm4subjects-0.5.1.dist-info}/METADATA +2 -3
ebm4subjects-0.5.1.dist-info/RECORD +12 -0
ebm4subjects-0.4.1.dist-info/RECORD +0 -12
{ebm4subjects-0.4.1.dist-info → ebm4subjects-0.5.1.dist-info}/WHEEL +0 -0
{ebm4subjects-0.4.1.dist-info → ebm4subjects-0.5.1.dist-info}/licenses/LICENSE +0 -0

ebm4subjects/analyzer.py CHANGED Viewed

@@ -32,8 +32,9 @@ class EbmAnalyzer:
             nltk.data.find(tokenizer_name)
         # If the tokenizer is not found, try to download it
         except LookupError as error:
-            if tokenizer_name in str(error):
-                nltk.download(tokenizer_name)
+            if "punkt" in str(error):
+                nltk.download("punkt")
+                nltk.download("punkt_tab")
             else:
                 raise

ebm4subjects/chunker.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from concurrent.futures import ProcessPoolExecutor
 from math import ceil
-from typing import Tuple
+from typing import Any, Tuple
 import polars as pl
@@ -17,9 +17,9 @@ class Chunker:
     Attributes:
         tokenizer (EbmAnalyzer): The tokenizer used for tokenizing sentences.
-        max_chunks (int): The maximum number of chunks to generate.
-        max_chunk_size (int): The maximum size of each chunk in characters.
-        max_sentences (int): The maximum number of sentences to consider.
+        max_chunk_count (int): The maximum number of chunks to generate.
+        max_chunk_length (int): The maximum size of each chunk in characters.
+        max_sentence_count (int): The maximum number of sentences to consider.
     Methods:
         - chunk_text: Chunks a given text into smaller sections
@@ -28,25 +28,30 @@ class Chunker:
     def __init__(
         self,
-        tokenizer_name: str,
-        max_chunks: int | None,
-        max_chunk_size: int | None,
-        max_sentences: int | None,
+        tokenizer: Any,
+        max_chunk_count: int | None,
+        max_chunk_length: int | None,
+        max_sentence_count: int | None,
     ):
         """
         Initializes the Chunker.
         Args:
-            tokenizer_name (str): The name of the tokenizer to use.
-            max_chunks (int | None): The maximum number of chunks to generate.
-            max_chunk_size (int | None): The maximum size of each chunk in characters.
-            max_sentences (int | None): The maximum number of sentences to consider.
+            tokenizer (Any): The name of the tokenizer to use or the tokenizer itself.
+            max_chunk_count (int | None): The maximum number of chunks to generate.
+            max_chunk_length (int | None): The maximum size of each chunk in characters.
+            max_sentence_count (int | None): The maximum number of sentences to consider.
         """
-        self.max_chunks = max_chunks if max_chunks else float("inf")
-        self.max_chunk_size = max_chunk_size if max_chunk_size else float("inf")
-        self.max_sentences = max_sentences if max_sentences else float("inf")
+        self.max_chunk_count = max_chunk_count if max_chunk_count else float("inf")
+        self.max_chunk_length = max_chunk_length if max_chunk_length else float("inf")
+        self.max_sentence_count = (
+            max_sentence_count if max_sentence_count else float("inf")
+        )
-        self.tokenizer = EbmAnalyzer(tokenizer_name)
+        if type(tokenizer) is str:
+            self.tokenizer = EbmAnalyzer(tokenizer)
+        else:
+            self.tokenizer = tokenizer
     def chunk_text(self, text: str) -> list[str]:
         """
@@ -63,7 +68,7 @@ class Chunker:
         # Tokenize the text into sentences
         sentences = self.tokenizer.tokenize_sentences(text)
-        sentences = sentences[: self.max_sentences]
+        sentences = sentences[: self.max_sentence_count]
         # Initialize an empty list to store the current chunk
         current_chunk = []
@@ -71,18 +76,18 @@ class Chunker:
         # Iterate over the sentences
         for sentence in sentences:
             # If the current chunk is not full, add the sentence to it
-            if len(" ".join(current_chunk)) < self.max_chunk_size:
+            if len(" ".join(current_chunk)) < self.max_chunk_length:
                 current_chunk.append(sentence)
             # Otherwise, add the current chunk to the list of chunks
             # and start a new chunk
             else:
                 chunks.append(" ".join(current_chunk))
                 current_chunk = [sentence]
-                if len(chunks) == self.max_chunks:
+                if len(chunks) == self.max_chunk_count:
                     break
         # If the maximum number of chunks is reached, break the loop
-        if current_chunk and len(chunks) < self.max_chunks:
+        if current_chunk and len(chunks) < self.max_chunk_count:
             chunks.append(" ".join(current_chunk))
         # Return the chunked text

ebm4subjects/duckdb_client.py CHANGED Viewed

@@ -37,8 +37,8 @@ class Duckdb_client:
                 (default: {"M": 32, "ef_construction": 256, "ef_search": 256}).
         Notes:
-            'hnsw_enable_experimental_persistence' needs to be set to 'True' in order
-            to store and query the index later
+            'hnsw_enable_experimental_persistence' needs to be set to 'True' in order
+            to store and query the index later
         """
         # Establish a connection to the DuckDB database
         self.connection = duckdb.connect(
@@ -76,10 +76,10 @@ class Duckdb_client:
                 (default: "cosine")
             force (bool, optional): Whether to replace the existing collection if it
                 already exists (default: False).
         Notes:
-            If 'hnsw_metric' is changed in this function 'hnsw_metric_function' in
-            the vector_search function needs to be changed accordingly in order
+            If 'hnsw_metric' is changed in this function 'hnsw_metric_function' in
+            the vector_search function needs to be changed accordingly in order
             for the index to work properly.
         """
         # Determine whether to replace the existing collection
@@ -147,10 +147,10 @@ class Duckdb_client:
             pl.DataFrame: The result of the vector search.
         Notes:
-            If 'hnsw_metric_function' is changed in this function 'hnsw_metric' in
-            the create_collection function needs to be changed accordingly in order
+            If 'hnsw_metric_function' is changed in this function 'hnsw_metric' in
+            the create_collection function needs to be changed accordingly in order
             for the index to work properly.
-            The argument 'chunk_size' is already set to the optimal value for the
+            The argument 'chunk_size' is already set to the optimal value for the
             query processing with DuckDB. Only change it if necessary.
         """
         # Create a temporary table to store the search results

ebm4subjects/ebm_logging.py CHANGED Viewed

@@ -7,7 +7,7 @@ class EbmLogger:
     """
     A custom logger class.
-    This class provides a way to log messages at different levels
+    This class provides a way to log messages at different levels
     (error, warning, info, debug) to a file.
     It also provides a way to get the logger instance.
@@ -16,6 +16,7 @@ class EbmLogger:
         log_path (str): The path to the log file.
         level (str): The log level (default: "info").
     """
     def __init__(self, log_path: str, level: str = "info") -> None:
         """
         Initializes the logger.
@@ -66,6 +67,7 @@ class NullLogger:
     This class is used when no logging is needed.
     """
     def __init__(self) -> None:
         """
         Initializes the null logger.
@@ -136,16 +138,17 @@ class NullLogger:
 class XGBLogging(xgboost.callback.TrainingCallback):
     """
     Custom XGBoost training callback for logging model performance during training.
     Args:
         logger (logging.Logger): Logger instance to use for logging.
         epoch_log_interval (int, optional): Interval at which to log model performance
             (default: 100).
     Attributes:
         logger (logging.Logger): Logger instance used for logging.
         epoch_log_interval (int): Interval at which to log model performance.
     """
     def __init__(
         self,
         logger: logging.Logger,
@@ -153,10 +156,10 @@ class XGBLogging(xgboost.callback.TrainingCallback):
     ) -> None:
         """
         Initializes the XGBLogger.
         Args:
             logger (logging.Logger): Logger instance to use for logging.
-            epoch_log_interval (int, optional): Interval at which to log model
+            epoch_log_interval (int, optional): Interval at which to log model
                 performance (default: to 100).
         """
         # Logger instance used for logging
@@ -172,14 +175,14 @@ class XGBLogging(xgboost.callback.TrainingCallback):
     ) -> bool:
         """
         Callback function called after each iteration of the XGBoost training process.
         Logs model performance at the specified interval.
         Args:
             model (xgboost.Booster): XGBoost model instance.
             epoch (int): Current epoch number.
             evals_log (dict): Dictionary containing evaluation metrics.
         Returns:
             bool: Always returns False, as specified by the XGBoost callback API.
         """

ebm4subjects/ebm_model.py CHANGED Viewed

@@ -1,7 +1,9 @@
 from __future__ import annotations
 import ast
+import logging
 from pathlib import Path
+from typing import Any
 import joblib
 import polars as pl
@@ -17,30 +19,31 @@ from ebm4subjects.embedding_generator import EmbeddingGenerator
 class EbmModel:
     def __init__(
         self,
-        db_path: str,
-        collection_name: str,
-        use_altLabels: bool,
-        duckdb_threads: int | str,
-        embedding_model_name: str,
+        embedding_model_name: str | Any,
         embedding_dimensions: int | str,
-        chunk_tokenizer: str,
-        max_chunks: int | str,
-        max_chunk_size: int | str,
+        chunk_tokenizer: str | Any,
+        max_chunk_count: int | str,
+        max_chunk_length: int | str,
         chunking_jobs: int | str,
-        max_sentences: int | str,
-        max_query_hits: int | str,
-        query_top_k: int | str,
+        max_sentence_count: int | str,
+        candidates_per_chunk: int | str,
+        candidates_per_doc: int | str,
         query_jobs: int | str,
         xgb_shrinkage: float | str,
         xgb_interaction_depth: int | str,
         xgb_subsample: float | str,
         xgb_rounds: int | str,
         xgb_jobs: int | str,
+        duckdb_threads: int | str,
+        db_path: str,
+        collection_name: str = "my_collection",
+        use_altLabels: bool = True,
         hnsw_index_params: dict | str | None = None,
         model_args: dict | str | None = None,
         encode_args_vocab: dict | str | None = None,
         encode_args_documents: dict | str | None = None,
         log_path: str | None = None,
+        logger: logging.Logger | None = None,
     ) -> None:
         """
         A class representing an Embedding-Based-Matching (EBM) model
@@ -109,14 +112,14 @@ class EbmModel:
         # Parameters for chunker
         self.chunk_tokenizer = chunk_tokenizer
-        self.max_chunks = int(max_chunks)
-        self.max_chunk_size = int(max_chunk_size)
-        self.max_sentences = int(max_sentences)
+        self.max_chunk_count = int(max_chunk_count)
+        self.max_chunk_length = int(max_chunk_length)
+        self.max_sentence_count = int(max_sentence_count)
         self.chunking_jobs = int(chunking_jobs)
         # Parameters for vector search
-        self.max_query_hits = int(max_query_hits)
-        self.query_top_k = int(query_top_k)
+        self.candidates_per_chunk = int(candidates_per_chunk)
+        self.candidates_per_doc = int(candidates_per_doc)
         self.query_jobs = int(query_jobs)
         # Parameters for XGB boost ranker
@@ -126,17 +129,8 @@ class EbmModel:
         self.train_rounds = int(xgb_rounds)
         self.train_jobs = int(xgb_jobs)
-        # Parameters for logger
-        # Only create logger if path to log file is set
-        self.logger = None
-        self.xgb_logger = None
-        self.xgb_callbacks = None
-        if log_path:
-            self.logger = EbmLogger(log_path, "info").get_logger()
-            self.xgb_logger = XGBLogging(self.logger, epoch_log_interval=1)
-            self.xgb_callbacks = [self.xgb_logger]
-        else:
-            self.logger = NullLogger()
+        # Initiliaze logging
+        self.init_logger(log_path, logger)
         # Initialize EBM model
         self.model = None
@@ -153,7 +147,9 @@ class EbmModel:
             None
         """
         if self.client is None:
-            self.logger.info("Initializing DuckDB client")
+            self.logger.info(
+                f"initializing DuckDB client with duckdb_threads: {self.duckdb_threads}"
+            )
             self.client = Duckdb_client(
                 db_path=self.db_path,
@@ -175,14 +171,35 @@ class EbmModel:
             None
         """
         if self.generator is None:
-            self.logger.info("Initializing embedding generator")
+            self.logger.info("initializing embedding generator")
             self.generator = EmbeddingGenerator(
                 model_name=self.embedding_model_name,
                 embedding_dimensions=self.embedding_dimensions,
                 **self.model_args,
             )
+    def init_logger(
+        self, log_path: str | None = None, logger: logging.Logger | None = None
+    ) -> None:
+        """
+        Initializes the logging for the EBM model.
+        Returns:
+            None
+        """
+        if log_path:
+            self.logger = EbmLogger(log_path, "info").get_logger()
+            self.xgb_logger = XGBLogging(self.logger, epoch_log_interval=1)
+            self.xgb_callbacks = [self.xgb_logger]
+        elif logger:
+            self.logger = logger
+            self.xgb_logger = XGBLogging(self.logger, epoch_log_interval=1)
+            self.xgb_callbacks = [self.xgb_logger]
+        else:
+            self.logger = NullLogger()
+            self.xgb_logger = None
+            self.xgb_callbacks = None
     def create_vector_db(
         self,
         vocab_in_path: str | None = None,
@@ -213,12 +230,12 @@ class EbmModel:
         # Check if output path exists and load existing vocabulary if so
         if vocab_out_path and Path(vocab_out_path).exists():
             self.logger.info(
-                f"Loading vocabulary with embeddings from {vocab_out_path}"
+                f"loading vocabulary with embeddings from {vocab_out_path}"
             )
             collection_df = pl.read_ipc(vocab_out_path)
         # Parse input vocabulary if provided
         elif vocab_in_path:
-            self.logger.info("Parsing vocabulary")
+            self.logger.info("parsing vocabulary")
             vocab = prepare_data.parse_vocab(
                 vocab_path=vocab_in_path,
                 use_altLabels=self.use_altLabels,
@@ -226,7 +243,7 @@ class EbmModel:
             # Initialize generator and add embeddings to vocabulary
             self._init_generator()
-            self.logger.info("Adding embeddings to vocabulary")
+            self.logger.info("adding embeddings to vocabulary")
             collection_df = prepare_data.add_vocab_embeddings(
                 vocab=vocab,
                 generator=self.generator,
@@ -238,20 +255,20 @@ class EbmModel:
                 # Check if file already exists and warn if so
                 if Path(vocab_out_path).exists() and not force:
                     self.logger.warn(
-                        f"""Cant't save vocabulary to {vocab_out_path}.
+                        f"""cant't save vocabulary to {vocab_out_path}.
                         File already exists"""
                     )
                 else:
-                    self.logger.info(f"Saving vocabulary to {vocab_out_path}")
+                    self.logger.info(f"saving vocabulary to {vocab_out_path}")
                     collection_df.write_ipc(vocab_out_path)
         else:
             # If no existing vocabulary and no input vocabulary is provided,
             # raise an error
-            raise ValueError("Vocabulary path is required")
+            raise ValueError("vocabulary path is required")
         # Initialize DuckDB client and create collection
         self._init_duckdb_client()
-        self.logger.info("Creating collection")
+        self.logger.info("creating collection")
         self.client.create_collection(
             collection_df=collection_df,
             collection_name=self.collection_name,
@@ -286,8 +303,6 @@ class EbmModel:
         Returns:
             pl.DataFrame: The prepared training data.
         """
-        self.logger.info("Preparing training data")
         # Check if pre-computed candidate training data is provided
         if not train_candidates:
             # If not, generate candidate training data in batches
@@ -306,7 +321,6 @@ class EbmModel:
                 )
         # Create a gold standard data frame from the provided doc IDs and label IDs
-        self.logger.info("Preparing gold standard")
         gold_standard = pl.DataFrame(
             {
                 "doc_id": doc_ids,
@@ -318,7 +332,7 @@ class EbmModel:
         # Compare the candidate training data to the gold standard
         # and prepare data for the training of the XGB ranker model
-        self.logger.info("Prepare training data and gold standard for training")
+        self.logger.info("prepare training data and gold standard for training")
         training_data = (
             self._compare_to_gold_standard(train_candidates, gold_standard)
             .with_columns(pl.when(pl.col("gold")).then(1).otherwise(0).alias("gold"))
@@ -406,19 +420,19 @@ class EbmModel:
             n_jobs = self.query_jobs
         # Create a Chunker instance with specified parameters
-        self.logger.info("Chunking text")
+        self.logger.info("chunking text")
         chunker = Chunker(
-            tokenizer_name=self.chunk_tokenizer,
-            max_chunks=self.max_chunks,
-            max_chunk_size=self.max_chunk_size,
-            max_sentences=self.max_sentences,
+            tokenizer=self.chunk_tokenizer,
+            max_chunk_count=self.max_chunk_count,
+            max_chunk_length=self.max_chunk_length,
+            max_sentence_count=self.max_sentence_count,
         )
         # Chunk the input text
         text_chunks = chunker.chunk_text(text)
         # Initialize the generator
         self._init_generator()
-        self.logger.info("Creating embeddings for text chunks")
+        self.logger.info("creating embeddings for text chunks")
         # Generate embeddings for the text chunks
         embeddings = self.generator.generate_embeddings(
             # Use the text chunks as input
@@ -432,7 +446,7 @@ class EbmModel:
         )
         # Create a query DataFrame
-        self.logger.info("Creating query dataframe")
+        self.logger.info("creating query dataframe")
         query_df = pl.DataFrame(
             {
                 # Create a column for the query ID
@@ -450,7 +464,9 @@ class EbmModel:
         # Initialize the DuckDB client
         self._init_duckdb_client()
-        self.logger.info("Running vector search and creating candidates")
+        self.logger.info(
+            f"running vector search and creating candidates with query_jobs: {n_jobs}"
+        )
         # Perform vector search using the query DataFrame
         # Using the parameters specified for the EBM model
         # and the optimal chunk size for the DuckDB
@@ -459,9 +475,9 @@ class EbmModel:
             collection_name=self.collection_name,
             embedding_dimensions=self.embedding_dimensions,
             n_jobs=n_jobs,
-            n_hits=self.max_query_hits,
+            n_hits=self.candidates_per_chunk,
             chunk_size=1024,
-            top_k=self.query_top_k,
+            top_k=self.candidates_per_doc,
             hnsw_metric_function="array_cosine_distance",
         )
@@ -500,20 +516,20 @@ class EbmModel:
             query_jobs = self.query_jobs
         # Create a Chunker instance with specified parameters
-        self.logger.info("Chunking texts in batches")
         chunker = Chunker(
-            tokenizer_name=self.chunk_tokenizer,
-            max_chunks=self.max_chunks,
-            max_chunk_size=self.max_chunk_size,
-            max_sentences=self.max_sentences,
+            tokenizer=self.chunk_tokenizer,
+            max_chunk_count=self.max_chunk_count,
+            max_chunk_length=self.max_chunk_length,
+            max_sentence_count=self.max_sentence_count,
         )
         # Chunk the input texts
+        self.logger.info(f"chunking texts with chunking_jobs: {chunking_jobs}")
         text_chunks, chunk_index = chunker.chunk_batches(texts, doc_ids, chunking_jobs)
         # Initialize the generator and chunk index
         self._init_generator()
         chunk_index = pl.concat(chunk_index).with_row_index("query_id")
-        self.logger.info("Creating embeddings for text chunks and query dataframe")
+        self.logger.info("creating embeddings for text chunks and query dataframe")
         embeddings = self.generator.generate_embeddings(
             texts=text_chunks,
             **(
@@ -531,15 +547,17 @@ class EbmModel:
         # Perform vector search using the query DataFrame
         # Using the parameters specified for the EBM model
         # and the optimal chunk size for the DuckDB
-        self.logger.info("Running vector search and creating candidates")
+        self.logger.info(
+            f"running vector search and creating candidates with query_jobs: {query_jobs}"
+        )
         candidates = self.client.vector_search(
             query_df=query_df,
             collection_name=self.collection_name,
             embedding_dimensions=self.embedding_dimensions,
             n_jobs=query_jobs,
-            n_hits=self.max_query_hits,
+            n_hits=self.candidates_per_chunk,
             chunk_size=1024,
-            top_k=self.query_top_k,
+            top_k=self.candidates_per_doc,
             hnsw_metric_function="array_cosine_distance",
         )
@@ -567,8 +585,8 @@ class EbmModel:
             n_jobs = self.train_jobs
         # Select the required columns from the train_data DataFrame,
-        # convert to a Pandas DataFrame and afterwards to training matrix
-        self.logger.info("Creating training matrix")
+        # convert to a numpy array and afterwards to training matrix
+        self.logger.info("creating training matrix")
         matrix = xgb.DMatrix(
             train_data.select(
                 [
@@ -582,14 +600,16 @@ class EbmModel:
                     "is_prefLabel",
                     "n_chunks",
                 ]
-            ).to_pandas(),
+            ).to_numpy(),
             # Use the gold standard as the target
-            train_data.to_pandas()["gold"],
+            train_data.select("gold").to_numpy(),
         )
         try:
             # Train the XGBoost model with the specified parameters
-            self.logger.info("Starting training of XGBoost Ranker")
+            self.logger.info(
+                f"starting training of XGBoost Ranker with xgb_jobs: {n_jobs}"
+            )
             model = xgb.train(
                 # Train the XGBoost model with the specified parameters
                 params={
@@ -611,7 +631,7 @@ class EbmModel:
                 # Use the specified callbacks
                 callbacks=self.xgb_callbacks,
             )
-            self.logger.info("Training successful finished")
+            self.logger.info("training successful finished")
         except xgb.core.XGBoostError:
             self.logger.critical(
                 "XGBoost can't train with candidates equal to gold standard "
@@ -641,7 +661,7 @@ class EbmModel:
         """
         # Select relevant columns from the candidates DataFrame to create a matrix
         # for the trained model to make predictions
-        self.logger.info("Creating matrix of candidates to generate predictions")
+        self.logger.info("creating matrix of candidates to generate predictions")
         matrix = xgb.DMatrix(
             candidates.select(
                 [
@@ -659,7 +679,7 @@ class EbmModel:
         )
         # Use the trained model to make predictions on the created matrix
-        self.logger.info("Making predictions for candidates")
+        self.logger.info("making predictions for candidates")
         predictions = self.model.predict(matrix)
         # Transform the predictions into a list of DataFrames containing the
@@ -671,11 +691,12 @@ class EbmModel:
             .select(["doc_id", "label_id", "score"])
             # Sort the DataFrame by document ID and score in ascending and
             # descending order, respectively
+            .with_columns(pl.col("doc_id").cast(pl.Int64))
             .sort(["doc_id", "score"], descending=[False, True])
             # Group the DataFrame by document ID and aggregate the top-k labels
             # and scores for each group
             .group_by("doc_id")
-            .agg(pl.all().head(self.query_top_k))
+            .agg(pl.all().head(self.candidates_per_doc))
             # Explode the aggregated DataFrame to create separate rows for each
             # label and score
             .explode(["label_id", "score"])
@@ -683,7 +704,7 @@ class EbmModel:
             .partition_by("doc_id")
         )
-    def save(self, output_path: str) -> None:
+    def save(self, output_path: str) -> list[str]:
         """
         Saves the current state of the EBM model to a file using joblib.
@@ -694,12 +715,18 @@ class EbmModel:
         Args:
             output_path: The file path where the serialized model will be written.
+        Returns:
+            list[str]: Output path of model file.
         Notes:
-            The model's client and generator attributes are reset to None.
+            The model's client, generator and loggers are reset to None.
         """
         self.client = None
         self.generator = None
-        joblib.dump(self, output_path)
+        self.init_logger()
+        return joblib.dump(self, output_path)
     @staticmethod
     def load(input_path: str) -> EbmModel:
@@ -707,9 +734,12 @@ class EbmModel:
         Loads an EBM model from a joblib serialized file.
         Args:
-        input_path (str): Path to the joblib serialized file containing the EBM model.
+            input_path (str): Path to the joblib serialized file containing the EBM model.
         Returns:
-        EbmModel: The loaded EBM model instance.
+            EbmModel: The loaded EBM model instance.
         """
-        return joblib.load(input_path)
+        ebm_model = joblib.load(input_path)
+        ebm_model.init_logger()
+        return ebm_model

ebm4subjects/embedding_generator.py CHANGED Viewed

@@ -9,7 +9,8 @@ class EmbeddingGenerator:
     A class for generating embeddings using a given SentenceTransformer model.
     Args:
-        model_name (str): The name of the SentenceTransformer model to use.
+        model_name (str, SentenceTransformer): The name of the SentenceTransformer
+            model or an SentenceTransformer model to use.
         embedding_dimensions (int): The dimensionality of the generated embeddings.
         **kwargs: Additional keyword arguments to pass to the model.
@@ -19,7 +20,9 @@ class EmbeddingGenerator:
         model (SentenceTransformer): The SentenceTransformer model instance.
     """
-    def __init__(self, model_name: str, embedding_dimensions: int, **kwargs) -> None:
+    def __init__(
+        self, model_name: str | SentenceTransformer, embedding_dimensions: int, **kwargs
+    ) -> None:
         """
         Initializes the EmbeddingGenerator.
@@ -31,9 +34,13 @@ class EmbeddingGenerator:
         # Create a SentenceTransformer model instance with the given
         # model name and embedding dimensions
-        self.model = SentenceTransformer(
-            model_name, truncate_dim=embedding_dimensions, **kwargs
-        )
+        # or set model to the given SentenceTransformer
+        if type(model_name) is str:
+            self.model = SentenceTransformer(
+                model_name, truncate_dim=embedding_dimensions, **kwargs
+            )
+        else:
+            self.model = model_name
         # Disabel parallelism for tokenizer
         # Needed because process might be already parallelized

{ebm4subjects-0.4.1.dist-info → ebm4subjects-0.5.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ebm4subjects
-Version: 0.4.1
+Version: 0.5.1
 Summary: Embedding Based Matching for Automated Subject Indexing
 Author: Deutsche Nationalbibliothek
 Maintainer-email: Clemens Rietdorf <c.rietdorf@dnb.de>, Maximilian Kähler <m.kaehler@dnb.de>
@@ -13,9 +13,7 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.10
 Requires-Dist: duckdb>=1.3.0
-Requires-Dist: flash-attn>=2.8.2
 Requires-Dist: nltk~=3.9.1
-Requires-Dist: pandas>=2.3.0
 Requires-Dist: polars>=1.30.0
 Requires-Dist: pyarrow>=21.0.0
 Requires-Dist: pyoxigraph>=0.4.11
@@ -56,6 +54,7 @@ This design borrows a lot of ideas from lexical matching like Maui [1], Kea [2]
 [2] Frank, E., Paynter, G. W., Witten, I. H., Gutwin, C., & Nevill-Manning, C. G. (1999). Domain-Specific Keyphrase Extraction. Proceedings of the 16 Th International Joint Conference on Artifical Intelligence (IJCAI99), 668–673.
+![Embedding Based Matching Sketch](ebm-sketch.svg)
 ## Why embedding based matching

ebm4subjects-0.5.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,12 @@
+ebm4subjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ebm4subjects/analyzer.py,sha256=lqX7AF8WsvwIavgtnmoVQ0i3wzBJJSeH47EiEwoLKGg,1664
+ebm4subjects/chunker.py,sha256=HcEFJtKWHFYZL8DmZcHGXLPGEkCqHZhh_0kSqyYVsdE,6764
+ebm4subjects/duckdb_client.py,sha256=8lDIpj2o2VTEtjHC_vTYrI5-RNXZnWMft45bS6z9B_k,13031
+ebm4subjects/ebm_logging.py,sha256=xkbqeVhSCNuhMwkx2yoIX8_D3z9DcsauZEmHhR1gaS0,5962
+ebm4subjects/ebm_model.py,sha256=PVFtljF3oZK8u0lA6df82lsTdAD8H1Y9CHvWq1jWF2M,29125
+ebm4subjects/embedding_generator.py,sha256=DZhZxkjcsy_4NA62_2V-4UPbIUkg5qMPat_cIgsoIAA,2609
+ebm4subjects/prepare_data.py,sha256=vQ-BdXkIP3iZJdPXol0WDlY8cRFMHkjzzL7oC7EbouE,3084
+ebm4subjects-0.5.1.dist-info/METADATA,sha256=QkOBvOAI49_AUipc3yAH6RVG9OVUs_8jO64Bjfy561U,8274
+ebm4subjects-0.5.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ebm4subjects-0.5.1.dist-info/licenses/LICENSE,sha256=RpvAZSjULHvoTR_esTlucJ08-zdQydnoqQLbqOh9Ub8,13826
+ebm4subjects-0.5.1.dist-info/RECORD,,

ebm4subjects-0.4.1.dist-info/RECORD DELETED Viewed

@@ -1,12 +0,0 @@
-ebm4subjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ebm4subjects/analyzer.py,sha256=kHsM2ZPzOIHp93UbdWtlgWARoH5ZbDueLsw9FJxpomM,1635
-ebm4subjects/chunker.py,sha256=5LMOAHAxm_VlwSQnmVJjBxb4Vrdv7N-ioW8wcC-VvF0,6545
-ebm4subjects/duckdb_client.py,sha256=JS6yyBe2p01cX_apFXjpYtT-w4Ow41HVhF3z9lKvvww,13046
-ebm4subjects/ebm_logging.py,sha256=0tvodIHXdAGPzOXHwQF5lNBZYZTHD33mZrogr1btqV4,6001
-ebm4subjects/ebm_model.py,sha256=sZI1QwKAH6wPPIxKbdLudD6rIJj7RNsDVJhV0fPBICw,28097
-ebm4subjects/embedding_generator.py,sha256=jC4rz4W50tKndxYezD7Kaoqysl8zhN-TbWirxA_WIQc,2354
-ebm4subjects/prepare_data.py,sha256=vQ-BdXkIP3iZJdPXol0WDlY8cRFMHkjzzL7oC7EbouE,3084
-ebm4subjects-0.4.1.dist-info/METADATA,sha256=Oo_YR6zYDnhxWZa7Gp_HZuK7qIFIQWlA3dAbDsze_YE,8285
-ebm4subjects-0.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-ebm4subjects-0.4.1.dist-info/licenses/LICENSE,sha256=RpvAZSjULHvoTR_esTlucJ08-zdQydnoqQLbqOh9Ub8,13826
-ebm4subjects-0.4.1.dist-info/RECORD,,

{ebm4subjects-0.4.1.dist-info → ebm4subjects-0.5.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{ebm4subjects-0.4.1.dist-info → ebm4subjects-0.5.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

ebm4subjects 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

ebm4subjects 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl