pyxecm 0.0.17__py3-none-any.whl → 0.0.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pyxecm might be problematic.

pyxecm/llm.py DELETED
@@ -1,451 +0,0 @@
- """
- LLM Module to implement classes to ask questions on a set of documents
- stored in Extended ECM. Optionally a chathistory can be preserved and
- provided as context for subsequent asks to the LLM.
-
- Class: ExtendedECMFileLoader
- Methods:
-
- __init__ : class initializer
- load: download a document from Extended ECM, convert it to text
-       and return it
-
- Class: ExtendedECMLoader
- Methods:
-
- __init__ : class initializer
- load_from_folder: load all documents stored in a defined folder. Optionally also recurse sub-folders.
- load_from_node_ids: load documents with given node IDs (list)
- load: main method to load a set of documents from Extended ECM and
-       converts them to text chunks and load these into a vector database
-
-
- Class: ExtendedECMChat
- Methods:
-
- __init__ : class initializer
- ask: ask the LLM a question about the Extended ECM content
-
- """
-
- __author__ = "Dr. Marc Diefenbruch"
- __copyright__ = "Copyright 2023, OpenText"
- __credits__ = ["Kai-Philip Gatzweiler"]
- __maintainer__ = "Dr. Marc Diefenbruch"
- __email__ = "mdiefenb@opentext.com"
-
- import logging
- import os
- import tempfile
-
- from typing import List
-
- from langchain.docstore.document import Document
- from langchain.document_loaders.base import BaseLoader
- from langchain.document_loaders.unstructured import UnstructuredFileLoader
- from langchain.text_splitter import CharacterTextSplitter
- from langchain.vectorstores import Chroma
- from langchain.embeddings.openai import OpenAIEmbeddings  # actually come from OpenAI
-
- # from langchain.vectorstores import FAISS
- from langchain.chains.question_answering import load_qa_chain
- from langchain.chains.qa_with_sources import load_qa_with_sources_chain
- from langchain.chains import ConversationalRetrievalChain
- from langchain.llms import OpenAI
-
- # from langchain.callbacks import get_openai_callback
- from langchain.memory import ConversationBufferMemory
-
- from pyxecm.otcs import OTCS
-
- logger = logging.getLogger(os.path.basename(__file__))
-
- CHROMA_DB_DIRECTORY = "chroma_db/extended_ecm"
-
-
- class ExtendedECMFileLoader(BaseLoader):
-     _otcs_object: OTCS = None
-     _node_id = 0
-
-     def __init__(self, otcs_object: object, node_id: int, node_name: str):
-         """Initialize with path to directory and how to glob over it."""
-         self._node_id = node_id
-         self._otcs_object = otcs_object
-         self._node_name = node_name
-
-     def load(self) -> List[Document]:
-         """Loads a document from Extended ECM into
-         a temporary file system location, then
-         converts it with the "UnstructuredFileLoader" into
-         a plain text representation
-
-         Returns:
-             List[Document]: the converted document (list with one item)
-         """
-         with tempfile.TemporaryDirectory() as temp_dir:
-             file_path = f"{temp_dir}/{self._node_name}"
-
-             self._otcs_object.downloadDocument(self._node_id, file_path=file_path)
-
-             try:
-                 loader = UnstructuredFileLoader(file_path)
-                 document = loader.load()
-                 # We want the original Extended ECM URL as source:
-                 if document:
-                     document[0].metadata["source"] = (
-                         self._otcs_object.csUrl() + "/app/nodes/" + str(self._node_id)
-                     )
-             except Exception as e:
-                 logger.warning(e)
-                 document = []
-
-         return document
-
-
- class ExtendedECMLoader(BaseLoader):
-     """Load defined document set from Extended ECM."""
-
-     _otcs_object: OTCS = None
-     _folder_path = []
-     _node_ids = []
-
-     _supported_mime_types = {
-         "doc": "application/msword",
-         "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-         "xls": "application/vnd.ms-excel",
-         "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-         "pdf": "application/pdf",
-     }
-
-     def __init__(
-         self,
-         otcs_object: object,
-         folder_path: list = [],
-         node_ids: list = [],
-         supported_mime_types: dict = {},
-     ):
-         """Initialize the class variables."""
-         self._otcs_object = otcs_object
-         self._folder_path = folder_path
-         self._node_ids = node_ids
-
-         if supported_mime_types:
-             self._supported_mime_types = supported_mime_types
-
-     # end method definition
-
-     def load_from_folder(
-         self, folder_id: int, recursive: bool = True
-     ) -> List[Document]:
-         """
-         Loads all supported document files from the specified folder
-         in Extended and returns a list of Document objects.
-
-         Args:
-             folder_id: Node ID of the folder.
-
-         Returns:
-             List[Document]: A list of Document objects representing
-                             the loaded documents.
-
-         """
-
-         docs = []
-
-         # Load the non-container items (-3 = non-container):
-         items = self._otcs_object.getSubnodes(folder_id, filter_node_types=-3)
-         if items:
-             items = items["results"]
-
-         for item in items:
-             properties = item["data"]["properties"]
-             if properties["type"] == 144 and properties["mime_type"] in list(
-                 self._supported_mime_types.values()
-             ):
-                 item_id = properties["id"]
-                 item_name = properties["name"]
-                 file_loader = ExtendedECMFileLoader(
-                     self._otcs_object, node_id=item_id, node_name=item_name
-                 )
-                 docs.extend(file_loader.load())
-                 logger.info("Number of Documents: {}".format(len(docs)))
-
-         # If we do not want to traverse all sub-folders / workspaces
-         # we are done and can return here:
-         if not recursive:
-             return docs
-
-         # Get children nodes of the current node:
-         sub_folders = self._otcs_object.getSubnodes(folder_id, filter_node_types=-1)
-         if sub_folders:
-             sub_folders = sub_folders["results"]
-
-         for sub_folder in sub_folders:
-             properties = sub_folder["data"]["properties"]
-             sub_folder_id = properties["id"]
-             sub_folder_type = properties["type"]
-             # We ony traverse folders (type = 0) and workspaces (type = 848):
-             if sub_folder_type == 0 or sub_folder_type == 848:
-                 docs += self.load_from_folder(sub_folder_id)
-                 logger.info("Number of Documents: {}".format(len(docs)))
-
-         return docs
-
-     # end method definition
-
-     def load_from_node_ids(self) -> List[Document]:
-         """
-         Loads all supported document files from Extended ECM
-         based on their node IDs and returns a list
-         of Document objects.
-
-         Args:
-             None
-
-         Returns:
-             List[Document]: A list of Document objects representing
-                             the loaded documents.
-         """
-
-         docs = []
-
-         for node_id in self._node_ids:
-             node = self._otcs_object.getNode(node_id)
-             if not node:
-                 logging.warning("Cannot find node with ID -> {}".format(node_id))
-                 continue
-             node_type = self._otcs_object.getResultValue(node, "type")
-             if not node_type == 144:
-                 continue
-             document_mime_type = self._otcs_object.getResultValue(node, "mime_type")
-             if not document_mime_type in list(self._supported_mime_types.values()):
-                 continue
-
-             document_name = self._otcs_object.getResultValue(node, "name")
-             loader = ExtendedECMFileLoader(
-                 self._otcs_object, node_id=node_id, node_name=document_name
-             )
-             docs.extend(loader.load())
-             logger.info("Number of Documents: {}".format(len(docs)))
-
-         return docs
-
-     # end method definition
-
-     def load(self) -> List[Document]:
-         """
-         Loads all supported document files from the specified Extended ECM folder
-         or node IDs and returns a list of Document objects.
-
-         Returns:
-             List[Document]: A list of Document objects
-                             representing the loaded documents.
-         """
-
-         docs: List[Document] = []
-
-         if self._folder_path:
-             folder = self._otcs_object.getNodeByVolumeAndPath(141, self._folder_path)
-             if not folder:
-                 return docs
-             folder_id = self._otcs_object.getResultValue(folder, "id")
-             docs.extend(self.load_from_folder(folder_id=folder_id))
-         if self._node_ids:
-             docs.extend(self.load_from_node_ids())
-
-         return docs
-
-     # end method definition
-
-
- class ExtendedECMChat(object):
-     """Ask Extended ECM questions about a configurable set of documents
-
-     Args:
-         object: base class
-     """
-
-     _otcs_object: OTCS = None
-     _folder_path = []
-     _node_ids = []
-
-     _supported_mime_types = {
-         "doc": "application/msword",
-         "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-         "xls": "application/vnd.ms-excel",
-         "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-         "pdf": "application/pdf",
-     }
-
-     _vector_store = None
-     _vector_store_file = None
-     _document_loader = None
-     _text_splitter = None
-     _memory = None
-     _conversation_memory: bool = False
-     _show_sources: bool = False
-     _llm = None
-     _embeddings = None
-     _chat_history = []
-
-     def __init__(
-         self,
-         otcs_object: object,
-         folder_path: list = [],
-         node_ids: list = [],
-         supported_mime_types: dict = {},
-         load_vector_store: bool = True,
-         vector_store_path: str = CHROMA_DB_DIRECTORY,
-         conversation_memory: bool = False,
-         show_sources: bool = False,
-         openai_api_key: str = "",
-         temperature: float = 0.0
-     ):
-         """Initialize the class variables."""
-         self._otcs_object = otcs_object
-         self._folder_path = folder_path
-         self._node_ids = node_ids
-         if not openai_api_key:
-             openai_api_key = os.getenv("OPENAI_API_KEY")
-         self._llm = OpenAI(openai_api_key=openai_api_key, temperature=temperature)
-         self._vector_store_path = vector_store_path
-         self._conversation_memory = conversation_memory
-         self._show_sources = show_sources
-         if supported_mime_types:
-             self._supported_mime_types = supported_mime_types
-
-         logger.info(
-             "Initialize Extended ECM Document Loader with folder path -> {} and node IDs -> {}".format(
-                 self._folder_path, self._node_ids
-             )
-         )
-         self._document_loader = ExtendedECMLoader(
-             otcs_object,
-             folder_path=self._folder_path,
-             node_ids=self._node_ids,
-             supported_mime_types=self._supported_mime_types,
-         )
-
-         # Create Embeddings
-         logger.info("Create embeddings...")
-         embeddings = OpenAIEmbeddings()
-
-         if load_vector_store and os.path.exists(self._vector_store_path):
-             logger.info(
-                 "Loading vector store from file -> {}".format(self._vector_store_path)
-             )
-             # Here we don't pass in the Extended ECM documents
-             # but take the chunks from the existing database.
-             self._vector_store = Chroma(
-                 embedding_function=embeddings,
-                 collection_name="extended_ecm",
-                 persist_directory=self._vector_store_path,
-             )
-         else:
-             logger.info(
-                 "Building new vector store from Extended ECM path -> {} and node IDs -> {}. This will take a while depending on the amount of documents...".format(
-                     self._folder_path, self._node_ids
-                 )
-             )
-             documents = self._document_loader.load()
-
-             if not documents:
-                 logger.error("No documents found. Exiting.")
-                 return
-
-             self._text_splitter = CharacterTextSplitter(
-                 separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
-             )
-             logger.info("Split documents into chunks...")
-             chunks = self._text_splitter.split_documents(documents)
-
-             logger.info("Build vector store and make it persistent...")
-             self._vector_store = Chroma.from_documents(
-                 chunks,
-                 embeddings,
-                 collection_name="extended_ecm",
-                 persist_directory=self._vector_store_path,
-             )
-
-             logger.info(
-                 "Save vector store to file -> {}...".format(self._vector_store_path)
-             )
-             self._vector_store.persist()
-
-         # if self._conversation_memory:
-         #     logger.info("Initialize Conversational Memory...")
-         #     self._memory = ConversationBufferMemory(
-         #         memory_key="chat_history", return_messages=True
-         #     )
-
-     # end method definition
-
-     def get_llm(self):
-         return self._llm
-
-     def get_vector_store(self):
-         return self._vector_store
-
-     def get_text_splitter(self):
-         return self._text_splitter
-
-     def get_memory(self):
-         return self._memory
-
-     def ask(self, question: str, with_score: bool = False):
-         """Aks Extended ECM a question about content.
-
-         Args:
-             question (str): question to ask
-         Responses:
-             dict: Response ["output_text"]
-
-         """
-
-         if not question:
-             return None
-
-         if with_score:
-             relevant_documents = self._vector_store.similarity_search_with_score(
-                 question
-             )
-         else:
-             relevant_documents = self._vector_store.similarity_search(question)
-
-         logger.debug("Relevant document chunks -> {}".format(relevant_documents))
-
-         # if self._conversation_memory:
-         #     chain = ConversationalRetrievalChain.from_llm(
-         #         OpenAI(temperature=0),
-         #         self._vector_store.as_retriever(),
-         #         memory=self._memory,
-         #     )
-         # else:
-
-         if self._show_sources:
-             chain = load_qa_with_sources_chain(self._llm, chain_type="stuff")
-         else:
-             chain = load_qa_chain(self._llm, chain_type="stuff")
-
-         chain_parameters = {
-             "input_documents": relevant_documents,
-             "question": question
-         }
-         if self._conversation_memory:
-             chain_parameters["chat_history"] = self._chat_history
-
-         response = chain(
-             chain_parameters,
-             return_only_outputs=True,
-         )
-         # response = chain(
-         #     {"input_documents": relevant_documents, "question": question},
-         #     return_only_outputs=True,
-         # )
-
-         if self._conversation_memory:
-             self._chat_history.append((question, response["output_text"]))
-
-         return response
-
-     # end method definition
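For context on what this removal takes away, below is a minimal usage sketch of the deleted ExtendedECMChat class, based solely on the constructor and ask() signatures visible in the diff above. The OTCS connection, folder path, and question are hypothetical placeholders, not part of the package.

from pyxecm.otcs import OTCS
from pyxecm.llm import ExtendedECMChat  # module removed in 0.0.19

# Hypothetical: an already-authenticated OTCS connection (see pyxecm.otcs for its constructor).
otcs: OTCS = ...

chat = ExtendedECMChat(
    otcs_object=otcs,
    folder_path=["Enterprise", "Contracts"],  # hypothetical folder path, resolved against volume 141 by the loader
    conversation_memory=True,                 # keep (question, answer) pairs as chat history
    show_sources=False,
    openai_api_key="",                        # falls back to the OPENAI_API_KEY environment variable
)

response = chat.ask("Which documents mention a termination clause?")  # hypothetical question
print(response["output_text"])

Per the deleted code, ask() returns the QA chain's response dict, with the answer in "output_text"; when conversation_memory is enabled, each question/answer pair is appended to the internal chat history and passed to subsequent calls.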
pyxecm-0.0.17.dist-info/RECORD
@@ -1,19 +0,0 @@
- pyxecm/__init__.py,sha256=qNJxrtqrkKtrNeaM8VEQpmkM4Ss6rkN2i4z8FiKDKow,508
- pyxecm/k8s.py,sha256=kNGc1kVFYKF9o3dRDdxQ4FMwC4b3QGgFX3SCnMv6k5k,33694
- pyxecm/llm.py,sha256=kPXaAdCDgPKRc7k8LknH_LfVcNWGGVU3T_Bittr5i9k,15046
- pyxecm/m365.py,sha256=Yi7CQvZhwg9xPBLBh2Tp6Z1ZE6URlsHbj_ydxdpxUgc,77524
- pyxecm/main.py,sha256=-Mg3JqtNhMko6E6GekJe4eTWXAJ9HMdjrUF-OST7lUY,50994
- pyxecm/otac.py,sha256=sgdqHQu9tBMm5pMGKe5wb1dgMbHfxPGKgn4t5BCv-7E,9554
- pyxecm/otcs.py,sha256=HiVyeLz15HQfrZZg1dIhTP-vYou8YzTku69AWdE5mq0,256092
- pyxecm/otds.py,sha256=FMOJulUS2X0K9LN5Jk9nCC9gHbVduK9HOSnZmhpEIVI,117829
- pyxecm/otiv.py,sha256=i3-z0tJttNkaq1VoOfEkKgcVDjvkixUZeLRkxITom2o,1627
- pyxecm/otpd.py,sha256=Djxno-r3XMkz6hb9qSLMecvdSa9RVmu6LpJeteLCx7o,10240
- pyxecm/payload.py,sha256=2kRTKXsUfdax4xme7X1_2BBx3jHlPf3zMTv2kl5F6s0,250685
- pyxecm/sap.py,sha256=T93T9mfE5HAJSZxF_0Cwvb-y7sNeef7GPgZenucnBok,5986
- pyxecm/translate.py,sha256=dEqQAg6ZWcorjgobNnW9p-IP9iOQ6ouklntr4qrLqsI,2718
- pyxecm/web.py,sha256=4ITGEQ7vOjMrp79e13k3lFB__q-4k5gDxg4T7kw765A,2719
- pyxecm-0.0.17.dist-info/LICENSE,sha256=z5DWWd5cHmQYJnq4BDt1bmVQjuXY1Qsp6y0v5ETCw-s,11360
- pyxecm-0.0.17.dist-info/METADATA,sha256=E91ixc0eSBunhiOzmAqcN0TRvsMN5T3VyJdt46mPV6w,1866
- pyxecm-0.0.17.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
- pyxecm-0.0.17.dist-info/top_level.txt,sha256=TGak3_dYN67ugKFbmRxRG1leDyOt0T7dypjdX4Ij1WE,7
- pyxecm-0.0.17.dist-info/RECORD,,