ddi-fw 0.0.66__py3-none-any.whl → 0.0.67__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/__init__.py +0 -1
- ddi_fw/datasets/core.py +1 -1
- ddi_fw/datasets/ddi_mdl/base.py +1 -1
- ddi_fw/experiments/pipeline.py +1 -1
- ddi_fw/experiments/pipeline_builder_pattern.py +1 -1
- ddi_fw/langchain/__init__.py +3 -0
- ddi_fw/{datasets/embedding_generator_new.py → langchain/embeddings.py} +2 -2
- ddi_fw/langchain/sentence_splitter.py +10 -0
- ddi_fw/langchain/storage.py +69 -0
- {ddi_fw-0.0.66.dist-info → ddi_fw-0.0.67.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.66.dist-info → ddi_fw-0.0.67.dist-info}/RECORD +13 -10
- {ddi_fw-0.0.66.dist-info → ddi_fw-0.0.67.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.66.dist-info → ddi_fw-0.0.67.dist-info}/top_level.txt +0 -0
ddi_fw/datasets/__init__.py
CHANGED
@@ -2,7 +2,6 @@ from .core import BaseDataset
|
|
2
2
|
from .ddi_mdl.base import DDIMDLDataset
|
3
3
|
from .mdf_sa_ddi.base import MDFSADDIDataset
|
4
4
|
from .embedding_generator import create_embeddings
|
5
|
-
from .embedding_generator_new import PoolingStrategy,SumPoolingStrategy,MeanPoolingStrategy,SentenceTransformerDecorator,PretrainedEmbeddings,SBertEmbeddings
|
6
5
|
from .idf_helper import IDF
|
7
6
|
from .feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
|
8
7
|
|
ddi_fw/datasets/core.py
CHANGED
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
|
|
5
5
|
import numpy as np
|
6
6
|
import pandas as pd
|
7
7
|
import pathlib
|
8
|
-
from ddi_fw.datasets.embedding_generator_new import PoolingStrategy
|
8
|
+
from ddi_fw.langchain.embeddings import PoolingStrategy
|
9
9
|
from ddi_fw.datasets.idf_helper import IDF
|
10
10
|
|
11
11
|
from ddi_fw.utils.zip_helper import ZipHelper
|
ddi_fw/datasets/ddi_mdl/base.py
CHANGED
ddi_fw/experiments/pipeline.py
CHANGED
@@ -15,7 +15,7 @@ import chromadb
|
|
15
15
|
from collections import defaultdict
|
16
16
|
from langchain_community.vectorstores import Chroma
|
17
17
|
from ddi_fw.ner.ner import CTakesNER
|
18
|
-
from ddi_fw.datasets.embedding_generator_new import PoolingStrategy
|
18
|
+
from ddi_fw.langchain.embeddings import PoolingStrategy
|
19
19
|
|
20
20
|
from ddi_fw.datasets import BaseDataset, DDIMDLDataset
|
21
21
|
|
@@ -15,7 +15,7 @@ import chromadb
|
|
15
15
|
from collections import defaultdict
|
16
16
|
from langchain_community.vectorstores import Chroma
|
17
17
|
from ddi_fw.ner.ner import CTakesNER
|
18
|
-
from ddi_fw.datasets.embedding_generator_new import PoolingStrategy
|
18
|
+
from ddi_fw.langchain.embeddings import PoolingStrategy
|
19
19
|
|
20
20
|
from ddi_fw.datasets import BaseDataset, DDIMDLDataset
|
21
21
|
|
@@ -58,7 +58,7 @@ class SumPoolingStrategy(PoolingStrategy):
|
|
58
58
|
|
59
59
|
class SentenceTransformerDecorator(Embeddings):
|
60
60
|
def __init__(self, model_name="all-MiniLM-L6-v2", **kwargs: Any):
|
61
|
-
self.embeddings = SentenceTransformerEmbeddings(model_name=model_name)
|
61
|
+
self.embeddings = SentenceTransformerEmbeddings(model_name=model_name, **kwargs)
|
62
62
|
|
63
63
|
def embed_documents(self, texts: List[str]) -> List[List[float]]:
|
64
64
|
return self.embeddings.embed_documents(texts)
|
@@ -67,7 +67,7 @@ class SentenceTransformerDecorator(Embeddings):
|
|
67
67
|
return self.embeddings.embed_query(text)
|
68
68
|
|
69
69
|
|
70
|
-
class PretrainedEmbeddings(
|
70
|
+
class PretrainedEmbeddings(Embeddings):
|
71
71
|
def __init__(self, model_name):
|
72
72
|
self.mmodel_name = model_name
|
73
73
|
self.model = AutoModel.from_pretrained(model_name)
|
@@ -0,0 +1,10 @@
|
|
1
|
+
from typing import List
|
2
|
+
import nltk
|
3
|
+
from nltk import sent_tokenize
|
4
|
+
from langchain_text_splitters.base import TextSplitter
|
5
|
+
|
6
|
+
nltk.download('punkt')
|
7
|
+
|
8
|
+
class SentenceSplitter(TextSplitter):
|
9
|
+
def split_text(self, text: str) -> List[str]:
|
10
|
+
return sent_tokenize(text)
|
@@ -0,0 +1,69 @@
|
|
1
|
+
from langchain.vectorstores import Chroma
|
2
|
+
# from langchain_community.vectorstores import Chroma
|
3
|
+
from langchain_community.vectorstores.utils import filter_complex_metadata
|
4
|
+
from langchain_core.embeddings import Embeddings
|
5
|
+
|
6
|
+
|
7
|
+
from langchain.docstore.document import Document
|
8
|
+
|
9
|
+
from langchain.document_loaders import DataFrameLoader
|
10
|
+
|
11
|
+
from langchain.text_splitter import TextSplitter, RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
|
12
|
+
from sentence_splitter import SentenceSplitter
|
13
|
+
|
14
|
+
from langchain_community.document_loaders.hugging_face_dataset import HuggingFaceDatasetLoader
|
15
|
+
|
16
|
+
|
17
|
+
class DataFrameToVectorDB:
|
18
|
+
def __init__(self,
|
19
|
+
collection_name,
|
20
|
+
persist_directory,
|
21
|
+
embeddings: Embeddings,
|
22
|
+
text_splitter: TextSplitter,
|
23
|
+
chunk_size=1000,
|
24
|
+
chunk_overlap=20):
|
25
|
+
self.collection_name = collection_name
|
26
|
+
self.persist_directory = persist_directory
|
27
|
+
self.embeddings = embeddings
|
28
|
+
self.text_splitter = text_splitter
|
29
|
+
self.chunk_size = chunk_size
|
30
|
+
self.chunk_overlap = chunk_overlap
|
31
|
+
self.vectordb = Chroma(collection_name=collection_name,
|
32
|
+
persist_directory=persist_directory,
|
33
|
+
embedding_function=embeddings)
|
34
|
+
|
35
|
+
def __split_docs(self, documents):
|
36
|
+
docs = self.text_splitter.split_documents(documents)
|
37
|
+
return docs
|
38
|
+
|
39
|
+
def __split_list(self, input_list, chunk_size):
|
40
|
+
for i in range(0, len(input_list), chunk_size):
|
41
|
+
yield input_list[i:i + chunk_size]
|
42
|
+
|
43
|
+
def store(self, df, columns, page_content_columns, max_batch_size=1000):
|
44
|
+
for page_content_column in page_content_columns:
|
45
|
+
copy_columns = columns.copy()
|
46
|
+
copy_columns.append(page_content_column)
|
47
|
+
col_df = df[copy_columns].copy()
|
48
|
+
col_df.dropna(subset=[page_content_column], inplace=True)
|
49
|
+
col_df['type'] = page_content_column # Set the type column
|
50
|
+
documents = []
|
51
|
+
|
52
|
+
loader = DataFrameLoader(data_frame=col_df, page_content_column=page_content_column)
|
53
|
+
loaded_docs = loader.load()
|
54
|
+
documents.extend(self.__split_docs(loaded_docs))
|
55
|
+
|
56
|
+
split_docs_chunked = self.__split_list(documents, max_batch_size)
|
57
|
+
|
58
|
+
for split_docs_chunk in split_docs_chunked:
|
59
|
+
# vectordb = Chroma.from_documents(
|
60
|
+
# collection_name=collection_name,
|
61
|
+
# documents=split_docs_chunk,
|
62
|
+
# embedding=embeddings,
|
63
|
+
# persist_directory=persist_directory,
|
64
|
+
# )
|
65
|
+
self.vectordb.add_documents(split_docs_chunk)
|
66
|
+
self.vectordb.persist()
|
67
|
+
|
68
|
+
|
69
|
+
|
@@ -1,12 +1,11 @@
|
|
1
|
-
ddi_fw/datasets/__init__.py,sha256=
|
2
|
-
ddi_fw/datasets/core.py,sha256=
|
1
|
+
ddi_fw/datasets/__init__.py,sha256=HSwQrqnzrEjIG4gif41pwJ_cST3t2XHGDxqFyuEBRwo,351
|
2
|
+
ddi_fw/datasets/core.py,sha256=lGVP2P8CIeSEG5fH230XV8bLoycblJxBQKYbdMSBITM,19021
|
3
3
|
ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
|
4
4
|
ddi_fw/datasets/embedding_generator.py,sha256=Jqrlv88RCu0Lg812KsA12X0cSaZuxbckJ4LNRKNy_qw,2173
|
5
|
-
ddi_fw/datasets/embedding_generator_new.py,sha256=GOE-Io6-DBwiUJSkgmxw9ZM1exCYYVu9KyP2dH3gf1o,7506
|
6
5
|
ddi_fw/datasets/feature_vector_generation.py,sha256=EImavcALxkIB0YG_smOzagMNzuWMbK9SaWSKwARx_qU,3254
|
7
6
|
ddi_fw/datasets/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
|
8
7
|
ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
|
9
|
-
ddi_fw/datasets/ddi_mdl/base.py,sha256=
|
8
|
+
ddi_fw/datasets/ddi_mdl/base.py,sha256=ZW8uJIvEizK2x_VkoyhNYcKh3ki3kQRsKxl8d2_hVYQ,4249
|
10
9
|
ddi_fw/datasets/ddi_mdl/readme.md,sha256=WC6lpmsEKvIISnZqENY7TWtzCQr98HPpE3oRsBl8pIw,625
|
11
10
|
ddi_fw/datasets/ddi_mdl/data/event.db,sha256=cmlSsf9MYjRzqR-mw3cUDnTnfT6FkpOG2yCl2mMwwew,30580736
|
12
11
|
ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt,sha256=XVlDqYATckrQwNSXqMSKVBqyoN_Hg8SK6CL-XMdLADY,102176
|
@@ -59,11 +58,15 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
|
|
59
58
|
ddi_fw/experiments/__init__.py,sha256=5L2xSolpFycNnflqOMdvJSiqRB16ExA5bbVGORKFX04,195
|
60
59
|
ddi_fw/experiments/custom_torch_model.py,sha256=iQ_R_EApzD2JCcASN8cie6D21oh7VCxaOQ45_dkiGwc,2576
|
61
60
|
ddi_fw/experiments/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
|
62
|
-
ddi_fw/experiments/pipeline.py,sha256=
|
63
|
-
ddi_fw/experiments/pipeline_builder_pattern.py,sha256=
|
61
|
+
ddi_fw/experiments/pipeline.py,sha256=N07EBv2IGa9oD0A1XxvUktDjGHi0SFmt3QqupF2rs3k,5681
|
62
|
+
ddi_fw/experiments/pipeline_builder_pattern.py,sha256=w6x7ietk4vONCAvUfssPycaRUQIYUJsbCNNj3BTASBI,5454
|
64
63
|
ddi_fw/experiments/pipeline_ner.py,sha256=unxEJCYrG6wEZjLmqvGdLRTMOBwELbGKkdygSpAR3b8,5043
|
65
64
|
ddi_fw/experiments/tensorflow_helper.py,sha256=xUnbntWyc2Wm4TvmVFAnpwLHg-o13oM26GUHom6d5m0,11776
|
66
65
|
ddi_fw/experiments/test.py,sha256=z1TfBpK75zGKpp2ZU8f6APjZlgBFthaCBN61YB9ma4o,2049
|
66
|
+
ddi_fw/langchain/__init__.py,sha256=8dBPZivc01WWaCH8sZ_UV8-XPyo74e9Qy6-fYgAiNLE,248
|
67
|
+
ddi_fw/langchain/embeddings.py,sha256=8J_SfO9pyET2W-Ltzq0_r9EchFzBsYdUabiOMma42Us,7515
|
68
|
+
ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
|
69
|
+
ddi_fw/langchain/storage.py,sha256=xGCpqeRIPU_S2In2JcFW3ixnsdfZ7d3Ux0xJNbV4h6s,2833
|
67
70
|
ddi_fw/ner/__init__.py,sha256=JwhGXrepomxPSsGsg2b_xPRC72AjvxOIn2CW5Mvscn0,26
|
68
71
|
ddi_fw/ner/mmlrestclient.py,sha256=NZta7m2Qm6I_qtVguMZhqtAUjVBmmXn0-TMnsNp0jpg,6859
|
69
72
|
ddi_fw/ner/ner.py,sha256=BEs9AFljAxOQrC2BEP1raSzRoypcfELS5UTdl4bjTqw,15863
|
@@ -83,7 +86,7 @@ ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
|
|
83
86
|
ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
|
84
87
|
ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
|
85
88
|
ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
|
86
|
-
ddi_fw-0.0.
|
87
|
-
ddi_fw-0.0.
|
88
|
-
ddi_fw-0.0.
|
89
|
-
ddi_fw-0.0.
|
89
|
+
ddi_fw-0.0.67.dist-info/METADATA,sha256=UvAmId4nJachjVEKPzm1_CRK8N2NiamboJpdosGxzew,1565
|
90
|
+
ddi_fw-0.0.67.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
|
91
|
+
ddi_fw-0.0.67.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
|
92
|
+
ddi_fw-0.0.67.dist-info/RECORD,,
|
File without changes
|
File without changes
|