ddi-fw 0.0.66__py3-none-any.whl → 0.0.67__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,6 @@ from .core import BaseDataset
2
2
  from .ddi_mdl.base import DDIMDLDataset
3
3
  from .mdf_sa_ddi.base import MDFSADDIDataset
4
4
  from .embedding_generator import create_embeddings
5
- from .embedding_generator_new import PoolingStrategy,SumPoolingStrategy,MeanPoolingStrategy,SentenceTransformerDecorator,PretrainedEmbeddings,SBertEmbeddings
6
5
  from .idf_helper import IDF
7
6
  from .feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
8
7
 
ddi_fw/datasets/core.py CHANGED
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
5
5
  import numpy as np
6
6
  import pandas as pd
7
7
  import pathlib
8
- from ddi_fw.datasets.embedding_generator_new import PoolingStrategy
8
+ from ddi_fw.langchain.embeddings import PoolingStrategy
9
9
  from ddi_fw.datasets.idf_helper import IDF
10
10
 
11
11
  from ddi_fw.utils.zip_helper import ZipHelper
@@ -2,7 +2,7 @@ import pathlib
2
2
 
3
3
  import pandas as pd
4
4
 
5
- from ddi_fw.datasets.embedding_generator_new import PoolingStrategy
5
+ from ddi_fw.langchain.embeddings import PoolingStrategy
6
6
  from .. import BaseDataset
7
7
  from ..db_utils import create_connection
8
8
 
@@ -15,7 +15,7 @@ import chromadb
15
15
  from collections import defaultdict
16
16
  from langchain_community.vectorstores import Chroma
17
17
  from ddi_fw.ner.ner import CTakesNER
18
- from ddi_fw.datasets.embedding_generator_new import PoolingStrategy
18
+ from ddi_fw.langchain.embeddings import PoolingStrategy
19
19
 
20
20
  from ddi_fw.datasets import BaseDataset, DDIMDLDataset
21
21
 
@@ -15,7 +15,7 @@ import chromadb
15
15
  from collections import defaultdict
16
16
  from langchain_community.vectorstores import Chroma
17
17
  from ddi_fw.ner.ner import CTakesNER
18
- from ddi_fw.datasets.embedding_generator_new import PoolingStrategy
18
+ from ddi_fw.langchain.embeddings import PoolingStrategy
19
19
 
20
20
  from ddi_fw.datasets import BaseDataset, DDIMDLDataset
21
21
 
@@ -0,0 +1,3 @@
1
+ from ..langchain.embeddings import PoolingStrategy,SumPoolingStrategy,MeanPoolingStrategy,SentenceTransformerDecorator,PretrainedEmbeddings,SBertEmbeddings
2
+ from .sentence_splitter import SentenceSplitter
3
+ from .storage import DataFrameToVectorDB
@@ -58,7 +58,7 @@ class SumPoolingStrategy(PoolingStrategy):
58
58
 
59
59
  class SentenceTransformerDecorator(Embeddings):
60
60
  def __init__(self, model_name="all-MiniLM-L6-v2", **kwargs: Any):
61
- self.embeddings = SentenceTransformerEmbeddings(model_name=model_name)
61
+ self.embeddings = SentenceTransformerEmbeddings(model_name=model_name, **kwargs)
62
62
 
63
63
  def embed_documents(self, texts: List[str]) -> List[List[float]]:
64
64
  return self.embeddings.embed_documents(texts)
@@ -67,7 +67,7 @@ class SentenceTransformerDecorator(Embeddings):
67
67
  return self.embeddings.embed_query(text)
68
68
 
69
69
 
70
- class PretrainedEmbeddings( Embeddings):
70
+ class PretrainedEmbeddings(Embeddings):
71
71
  def __init__(self, model_name):
72
72
  self.mmodel_name = model_name
73
73
  self.model = AutoModel.from_pretrained(model_name)
@@ -0,0 +1,10 @@
1
+ from typing import List
2
+ import nltk
3
+ from nltk import sent_tokenize
4
+ from langchain_text_splitters.base import TextSplitter
5
+
6
+ nltk.download('punkt')
7
+
8
+ class SentenceSplitter(TextSplitter):
9
+ def split_text(self, text: str) -> List[str]:
10
+ return sent_tokenize(text)
@@ -0,0 +1,69 @@
1
+ from langchain.vectorstores import Chroma
2
+ # from langchain_community.vectorstores import Chroma
3
+ from langchain_community.vectorstores.utils import filter_complex_metadata
4
+ from langchain_core.embeddings import Embeddings
5
+
6
+
7
+ from langchain.docstore.document import Document
8
+
9
+ from langchain.document_loaders import DataFrameLoader
10
+
11
+ from langchain.text_splitter import TextSplitter, RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
12
+ from sentence_splitter import SentenceSplitter
13
+
14
+ from langchain_community.document_loaders.hugging_face_dataset import HuggingFaceDatasetLoader
15
+
16
+
17
+ class DataFrameToVectorDB:
18
+ def __init__(self,
19
+ collection_name,
20
+ persist_directory,
21
+ embeddings: Embeddings,
22
+ text_splitter: TextSplitter,
23
+ chunk_size=1000,
24
+ chunk_overlap=20):
25
+ self.collection_name = collection_name
26
+ self.persist_directory = persist_directory
27
+ self.embeddings = embeddings
28
+ self.text_splitter = text_splitter
29
+ self.chunk_size = chunk_size
30
+ self.chunk_overlap = chunk_overlap
31
+ self.vectordb = Chroma(collection_name=collection_name,
32
+ persist_directory=persist_directory,
33
+ embedding_function=embeddings)
34
+
35
+ def __split_docs(self, documents):
36
+ docs = self.text_splitter.split_documents(documents)
37
+ return docs
38
+
39
+ def __split_list(self, input_list, chunk_size):
40
+ for i in range(0, len(input_list), chunk_size):
41
+ yield input_list[i:i + chunk_size]
42
+
43
+ def store(self, df, columns, page_content_columns, max_batch_size=1000):
44
+ for page_content_column in page_content_columns:
45
+ copy_columns = columns.copy()
46
+ copy_columns.append(page_content_column)
47
+ col_df = df[copy_columns].copy()
48
+ col_df.dropna(subset=[page_content_column], inplace=True)
49
+ col_df['type'] = page_content_column # Set the type column
50
+ documents = []
51
+
52
+ loader = DataFrameLoader(data_frame=col_df, page_content_column=page_content_column)
53
+ loaded_docs = loader.load()
54
+ documents.extend(self.__split_docs(loaded_docs))
55
+
56
+ split_docs_chunked = self.__split_list(documents, max_batch_size)
57
+
58
+ for split_docs_chunk in split_docs_chunked:
59
+ # vectordb = Chroma.from_documents(
60
+ # collection_name=collection_name,
61
+ # documents=split_docs_chunk,
62
+ # embedding=embeddings,
63
+ # persist_directory=persist_directory,
64
+ # )
65
+ self.vectordb.add_documents(split_docs_chunk)
66
+ self.vectordb.persist()
67
+
68
+
69
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ddi_fw
3
- Version: 0.0.66
3
+ Version: 0.0.67
4
4
  Summary: Do not use :)
5
5
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
6
6
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -1,12 +1,11 @@
1
- ddi_fw/datasets/__init__.py,sha256=CqDrx7Ov83pXRh-n0ylembBmzhlW_yFWiheBcISrKdg,510
2
- ddi_fw/datasets/core.py,sha256=ffza6yX3zvZV8Lp7but5f49J0837gZQKCSQ3iMBT6BE,19033
1
+ ddi_fw/datasets/__init__.py,sha256=HSwQrqnzrEjIG4gif41pwJ_cST3t2XHGDxqFyuEBRwo,351
2
+ ddi_fw/datasets/core.py,sha256=lGVP2P8CIeSEG5fH230XV8bLoycblJxBQKYbdMSBITM,19021
3
3
  ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
4
4
  ddi_fw/datasets/embedding_generator.py,sha256=Jqrlv88RCu0Lg812KsA12X0cSaZuxbckJ4LNRKNy_qw,2173
5
- ddi_fw/datasets/embedding_generator_new.py,sha256=GOE-Io6-DBwiUJSkgmxw9ZM1exCYYVu9KyP2dH3gf1o,7506
6
5
  ddi_fw/datasets/feature_vector_generation.py,sha256=EImavcALxkIB0YG_smOzagMNzuWMbK9SaWSKwARx_qU,3254
7
6
  ddi_fw/datasets/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
8
7
  ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
9
- ddi_fw/datasets/ddi_mdl/base.py,sha256=7MMAMJtJqqTXmYiI4Hf7e4kO2xoOynPocHazJrE9Y_w,4261
8
+ ddi_fw/datasets/ddi_mdl/base.py,sha256=ZW8uJIvEizK2x_VkoyhNYcKh3ki3kQRsKxl8d2_hVYQ,4249
10
9
  ddi_fw/datasets/ddi_mdl/readme.md,sha256=WC6lpmsEKvIISnZqENY7TWtzCQr98HPpE3oRsBl8pIw,625
11
10
  ddi_fw/datasets/ddi_mdl/data/event.db,sha256=cmlSsf9MYjRzqR-mw3cUDnTnfT6FkpOG2yCl2mMwwew,30580736
12
11
  ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt,sha256=XVlDqYATckrQwNSXqMSKVBqyoN_Hg8SK6CL-XMdLADY,102176
@@ -59,11 +58,15 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
59
58
  ddi_fw/experiments/__init__.py,sha256=5L2xSolpFycNnflqOMdvJSiqRB16ExA5bbVGORKFX04,195
60
59
  ddi_fw/experiments/custom_torch_model.py,sha256=iQ_R_EApzD2JCcASN8cie6D21oh7VCxaOQ45_dkiGwc,2576
61
60
  ddi_fw/experiments/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
62
- ddi_fw/experiments/pipeline.py,sha256=dI_yxBt73hGFMxFIVhCRSjrefaGbILfQXCHr9VQ8tI8,5693
63
- ddi_fw/experiments/pipeline_builder_pattern.py,sha256=q1PNEQFoO5U3UidEoGB8rgLA7KXr4FsJTXEug5c5UJg,5466
61
+ ddi_fw/experiments/pipeline.py,sha256=N07EBv2IGa9oD0A1XxvUktDjGHi0SFmt3QqupF2rs3k,5681
62
+ ddi_fw/experiments/pipeline_builder_pattern.py,sha256=w6x7ietk4vONCAvUfssPycaRUQIYUJsbCNNj3BTASBI,5454
64
63
  ddi_fw/experiments/pipeline_ner.py,sha256=unxEJCYrG6wEZjLmqvGdLRTMOBwELbGKkdygSpAR3b8,5043
65
64
  ddi_fw/experiments/tensorflow_helper.py,sha256=xUnbntWyc2Wm4TvmVFAnpwLHg-o13oM26GUHom6d5m0,11776
66
65
  ddi_fw/experiments/test.py,sha256=z1TfBpK75zGKpp2ZU8f6APjZlgBFthaCBN61YB9ma4o,2049
66
+ ddi_fw/langchain/__init__.py,sha256=8dBPZivc01WWaCH8sZ_UV8-XPyo74e9Qy6-fYgAiNLE,248
67
+ ddi_fw/langchain/embeddings.py,sha256=8J_SfO9pyET2W-Ltzq0_r9EchFzBsYdUabiOMma42Us,7515
68
+ ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
69
+ ddi_fw/langchain/storage.py,sha256=xGCpqeRIPU_S2In2JcFW3ixnsdfZ7d3Ux0xJNbV4h6s,2833
67
70
  ddi_fw/ner/__init__.py,sha256=JwhGXrepomxPSsGsg2b_xPRC72AjvxOIn2CW5Mvscn0,26
68
71
  ddi_fw/ner/mmlrestclient.py,sha256=NZta7m2Qm6I_qtVguMZhqtAUjVBmmXn0-TMnsNp0jpg,6859
69
72
  ddi_fw/ner/ner.py,sha256=BEs9AFljAxOQrC2BEP1raSzRoypcfELS5UTdl4bjTqw,15863
@@ -83,7 +86,7 @@ ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
83
86
  ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
84
87
  ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
85
88
  ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
86
- ddi_fw-0.0.66.dist-info/METADATA,sha256=r5ST3WA9Wzm2x7LQOBUpt6E8ltZjimmA-MWrV6n5ubE,1565
87
- ddi_fw-0.0.66.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
88
- ddi_fw-0.0.66.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
89
- ddi_fw-0.0.66.dist-info/RECORD,,
89
+ ddi_fw-0.0.67.dist-info/METADATA,sha256=UvAmId4nJachjVEKPzm1_CRK8N2NiamboJpdosGxzew,1565
90
+ ddi_fw-0.0.67.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
91
+ ddi_fw-0.0.67.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
92
+ ddi_fw-0.0.67.dist-info/RECORD,,