bisheng-langchain 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. bisheng_langchain/__init__.py +0 -0
  2. bisheng_langchain/chains/__init__.py +5 -0
  3. bisheng_langchain/chains/combine_documents/__init__.py +0 -0
  4. bisheng_langchain/chains/combine_documents/stuff.py +56 -0
  5. bisheng_langchain/chains/question_answering/__init__.py +240 -0
  6. bisheng_langchain/chains/retrieval_qa/__init__.py +0 -0
  7. bisheng_langchain/chains/retrieval_qa/base.py +89 -0
  8. bisheng_langchain/chat_models/__init__.py +11 -0
  9. bisheng_langchain/chat_models/host_llm.py +409 -0
  10. bisheng_langchain/chat_models/interface/__init__.py +10 -0
  11. bisheng_langchain/chat_models/interface/minimax.py +123 -0
  12. bisheng_langchain/chat_models/interface/openai.py +68 -0
  13. bisheng_langchain/chat_models/interface/types.py +61 -0
  14. bisheng_langchain/chat_models/interface/utils.py +5 -0
  15. bisheng_langchain/chat_models/interface/wenxin.py +114 -0
  16. bisheng_langchain/chat_models/interface/xunfei.py +233 -0
  17. bisheng_langchain/chat_models/interface/zhipuai.py +81 -0
  18. bisheng_langchain/chat_models/minimax.py +354 -0
  19. bisheng_langchain/chat_models/proxy_llm.py +354 -0
  20. bisheng_langchain/chat_models/wenxin.py +349 -0
  21. bisheng_langchain/chat_models/xunfeiai.py +355 -0
  22. bisheng_langchain/chat_models/zhipuai.py +379 -0
  23. bisheng_langchain/document_loaders/__init__.py +3 -0
  24. bisheng_langchain/document_loaders/elem_html.py +0 -0
  25. bisheng_langchain/document_loaders/elem_image.py +0 -0
  26. bisheng_langchain/document_loaders/elem_pdf.py +655 -0
  27. bisheng_langchain/document_loaders/parsers/__init__.py +5 -0
  28. bisheng_langchain/document_loaders/parsers/image.py +28 -0
  29. bisheng_langchain/document_loaders/parsers/test_image.py +286 -0
  30. bisheng_langchain/embeddings/__init__.py +7 -0
  31. bisheng_langchain/embeddings/host_embedding.py +133 -0
  32. bisheng_langchain/embeddings/interface/__init__.py +3 -0
  33. bisheng_langchain/embeddings/interface/types.py +23 -0
  34. bisheng_langchain/embeddings/interface/wenxin.py +86 -0
  35. bisheng_langchain/embeddings/wenxin.py +139 -0
  36. bisheng_langchain/vectorstores/__init__.py +3 -0
  37. bisheng_langchain/vectorstores/elastic_keywords_search.py +284 -0
  38. bisheng_langchain-0.0.1.dist-info/METADATA +64 -0
  39. bisheng_langchain-0.0.1.dist-info/RECORD +41 -0
  40. bisheng_langchain-0.0.1.dist-info/WHEEL +5 -0
  41. bisheng_langchain-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,284 @@
1
+ """Wrapper around Elasticsearch vector database."""
2
+ from __future__ import annotations
3
+
4
+ import uuid
5
+ from abc import ABC
6
+ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
7
+
8
+ import jieba.analyse
9
+ from langchain.docstore.document import Document
10
+ from langchain.embeddings.base import Embeddings
11
+ from langchain.utils import get_from_dict_or_env
12
+ from langchain.vectorstores.base import VectorStore
13
+
14
+ if TYPE_CHECKING:
15
+ from elasticsearch import Elasticsearch # noqa: F401
16
+
17
+
18
+ def _default_text_mapping() -> Dict:
19
+ return {'properties': {'text': {'type': 'text'}}}
20
+
21
+
22
+ # ElasticKeywordsSearch is a concrete implementation of the abstract base class
23
+ # VectorStore, which defines a common interface for all vector database
24
+ # implementations. Despite the VectorStore interface, it stores plain text and
25
+ # retrieves it by keyword matching rather than by embedding similarity. It also
26
+ # lists ABC as a base class, so if you plan to subclass ElasticKeywordsSearch
27
+ # you can inherit from it and override the necessary methods and attributes
28
+ # with your own implementation.
29
class ElasticKeywordsSearch(VectorStore, ABC):
    """Keyword-search wrapper around Elasticsearch exposing the VectorStore API.

    Texts are stored in an Elasticsearch index with a plain ``text`` field and
    retrieved by matching keywords that jieba extracts from the query. No
    embeddings are computed or stored by this class.

    To connect to an Elasticsearch instance that does not require
    login credentials, pass the Elasticsearch URL and index name to the
    constructor.

    Example:
        .. code-block:: python

            from bisheng_langchain.vectorstores.elastic_keywords_search import (
                ElasticKeywordsSearch,
            )

            elastic_keywords_search = ElasticKeywordsSearch(
                elasticsearch_url="http://localhost:9200",
                index_name="test_index",
            )


    To connect to an Elasticsearch instance that requires login credentials,
    including Elastic Cloud, use the Elasticsearch URL format
    https://username:password@es_host:9243. For example, to connect to Elastic
    Cloud, create the Elasticsearch URL with the required authentication details and
    pass it to the ElasticKeywordsSearch constructor as the named parameter
    elasticsearch_url.

    You can obtain your Elastic Cloud URL and login credentials by logging in to the
    Elastic Cloud console at https://cloud.elastic.co, selecting your deployment, and
    navigating to the "Deployments" page.

    To obtain your Elastic Cloud password for the default "elastic" user:

    1. Log in to the Elastic Cloud console at https://cloud.elastic.co
    2. Go to "Security" > "Users"
    3. Locate the "elastic" user and click "Edit"
    4. Click "Reset password"
    5. Follow the prompts to reset the password

    The format for Elastic Cloud URLs is
    https://username:password@cluster_id.region_id.gcp.cloud.es.io:9243.

    Example:
        .. code-block:: python

            from bisheng_langchain.vectorstores.elastic_keywords_search import (
                ElasticKeywordsSearch,
            )

            elastic_host = "cluster_id.region_id.gcp.cloud.es.io"
            elasticsearch_url = f"https://username:password@{elastic_host}:9243"
            elastic_keywords_search = ElasticKeywordsSearch(
                elasticsearch_url=elasticsearch_url,
                index_name="test_index"
            )

    Args:
        elasticsearch_url (str): The URL for the Elasticsearch instance.
        index_name (str): The name of the Elasticsearch index for the keywords.
        ssl_verify (Optional[Dict[str, Any]]): Extra keyword arguments that are
            forwarded unchanged to ``elasticsearch.Elasticsearch``.

    Raises:
        ImportError: If the elasticsearch python package is not installed.
        ValueError: If the Elasticsearch client raises ``ValueError`` while
            being constructed from ``elasticsearch_url``.
    """

    def __init__(
        self,
        elasticsearch_url: str,
        index_name: str,
        *,
        ssl_verify: Optional[Dict[str, Any]] = None,
    ):
        """Create the Elasticsearch client and remember the target index."""
        try:
            import elasticsearch
        except ImportError as e:
            raise ImportError(
                'Could not import elasticsearch python package. '
                'Please install it with `pip install elasticsearch`.') from e
        self.index_name = index_name
        _ssl_verify = ssl_verify or {}
        try:
            self.client = elasticsearch.Elasticsearch(elasticsearch_url,
                                                      **_ssl_verify)
        except ValueError as e:
            raise ValueError(
                f'Your elasticsearch client string is mis-formatted. Got error: {e} '
            ) from e

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        refresh_indices: bool = True,
        **kwargs: Any,
    ) -> List[str]:
        """Index texts (and optional metadata) into the Elasticsearch index.

        The index is created with the default text mapping if it does not
        exist yet.

        Args:
            texts: Iterable of strings to add to the index. It may be a
                one-shot iterator; it is materialized once internally.
            metadatas: Optional list of metadatas associated with the texts,
                looked up by position.
            ids: Optional list of unique IDs, looked up by position. Random
                UUID4 strings are generated when omitted or empty.
            refresh_indices: bool to refresh ElasticSearch indices after the
                bulk request.

        Returns:
            List of ids from adding the texts into the index.

        Raises:
            ImportError: If the elasticsearch python package is not installed.
        """
        try:
            from elasticsearch.exceptions import NotFoundError
            from elasticsearch.helpers import bulk
        except ImportError as e:
            raise ImportError(
                'Could not import elasticsearch python package. '
                'Please install it with `pip install elasticsearch`.') from e

        # ``texts`` is consumed twice below (id generation and request
        # building); materialize it so a generator is not exhausted after the
        # first pass, which would silently index nothing.
        texts = list(texts)
        ids = ids or [str(uuid.uuid4()) for _ in texts]
        mapping = _default_text_mapping()

        # check to see if the index already exists
        try:
            self.client.indices.get(index=self.index_name)
        except NotFoundError:
            self.create_index(self.client, self.index_name, mapping)

        requests = [{
            '_op_type': 'index',
            '_index': self.index_name,
            'text': text,
            'metadata': metadatas[i] if metadatas else {},
            '_id': ids[i],
        } for i, text in enumerate(texts)]
        bulk(self.client, requests)

        if refresh_indices:
            self.client.indices.refresh(index=self.index_name)
        return ids

    def similarity_search(self,
                          query: str,
                          k: int = 4,
                          query_strategy: str = 'match_phrase',
                          must_or_should: str = 'should',
                          **kwargs: Any) -> List[Document]:
        """Return up to ``k`` documents matching keywords extracted from ``query``.

        Up to 10 keywords are extracted from ``query`` with
        ``jieba.analyse.extract_tags``; each becomes one clause of a ``bool``
        query against the ``text`` field.

        Args:
            query: Natural-language query string.
            k: Maximum number of documents to return.
            query_strategy: Elasticsearch query type used for every keyword
                clause (for example ``'match_phrase'``).
            must_or_should: ``'must'`` or ``'should'``; the ``bool`` occurrence
                type under which the keyword clauses are placed.

        Returns:
            The matching documents, in the order Elasticsearch returned them.

        Raises:
            ValueError: If ``must_or_should`` is neither ``'must'`` nor
                ``'should'``.
        """
        # Explicit check instead of ``assert`` so the validation still runs
        # when Python is started with -O.
        if must_or_should not in ('must', 'should'):
            raise ValueError('only support must and should.')
        keywords = jieba.analyse.extract_tags(query, topK=10, withWeight=False)
        clauses = [{query_strategy: {'text': key}} for key in keywords]
        match_query = {'bool': {must_or_should: clauses}}
        docs_and_scores = self.similarity_search_with_score(match_query, k)
        return [doc for doc, _score in docs_and_scores]

    def similarity_search_with_score(
            self,
            query: Any,
            k: int = 4,
            **kwargs: Any) -> List[Tuple[Document, float]]:
        """Run an Elasticsearch query and return documents with their scores.

        Args:
            query: The query body handed to Elasticsearch as-is.
                ``similarity_search`` passes a ``bool`` query dict here.
            k: Maximum number of hits to request.

        Returns:
            One ``(Document, score)`` tuple per hit. The document is built
            from the hit's ``_source['text']`` and ``_source['metadata']``,
            and the score is the hit's ``_score``.
        """
        response = self.client_search(self.client,
                                      self.index_name,
                                      query,
                                      size=k)
        return [(
            Document(
                page_content=hit['_source']['text'],
                metadata=hit['_source']['metadata'],
            ),
            hit['_score'],
        ) for hit in response['hits']['hits']]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        _: Embeddings,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        index_name: Optional[str] = None,
        refresh_indices: bool = True,
        **kwargs: Any,
    ) -> ElasticKeywordsSearch:
        """Construct ElasticKeywordsSearch wrapper from raw documents.

        This is a user-friendly interface that:
            1. Connects to the Elasticsearch instance.
            2. Creates the index if it does not exist yet.
            3. Adds the documents to that index.

        The second positional argument (an embeddings object) is accepted only
        for compatibility with the ``VectorStore.from_texts`` signature and is
        ignored; no embeddings are computed.

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from bisheng_langchain.vectorstores.elastic_keywords_search import (
                    ElasticKeywordsSearch,
                )
                from langchain.embeddings import OpenAIEmbeddings
                embeddings = OpenAIEmbeddings()
                elastic_keywords_search = ElasticKeywordsSearch.from_texts(
                    texts,
                    embeddings,
                    elasticsearch_url="http://localhost:9200"
                )

        Args:
            texts: Texts to index.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of unique IDs.
            index_name: Index to write to; a random UUID4 hex string is used
                when omitted.
            refresh_indices: bool to refresh ElasticSearch indices after
                adding the texts.
            **kwargs: May contain ``elasticsearch_url`` (otherwise the
                ``ELASTICSEARCH_URL`` environment variable is used). All
                remaining entries are forwarded to the constructor.

        Returns:
            The populated ElasticKeywordsSearch instance.
        """
        elasticsearch_url = get_from_dict_or_env(kwargs, 'elasticsearch_url',
                                                 'ELASTICSEARCH_URL')
        # The URL is passed positionally below; drop it so it is not also
        # forwarded to the constructor as a keyword argument.
        kwargs.pop('elasticsearch_url', None)
        index_name = index_name or uuid.uuid4().hex
        vectorsearch = cls(elasticsearch_url, index_name, **kwargs)
        vectorsearch.add_texts(texts,
                               metadatas=metadatas,
                               ids=ids,
                               refresh_indices=refresh_indices)
        return vectorsearch

    @staticmethod
    def _major_version(client: Any) -> int:
        """Return the major version from ``client.info()['version']['number']``.

        The component before the first dot is parsed as a whole, so
        multi-digit major versions (for example ``10.x``) are handled.
        """
        number = client.info()['version']['number']
        return int(number.split('.')[0])

    def create_index(self, client: Any, index_name: str,
                     mapping: Dict) -> None:
        """Create ``index_name`` with ``mapping``.

        The call form is chosen from the version reported by
        ``client.info()``: ``mappings=`` for major version 8 and above,
        otherwise a ``body`` dict.
        """
        if self._major_version(client) >= 8:
            client.indices.create(index=index_name, mappings=mapping)
        else:
            client.indices.create(index=index_name, body={'mappings': mapping})

    def client_search(self, client: Any, index_name: str, script_query: Dict,
                      size: int) -> Any:
        """Search ``index_name`` with ``script_query`` and return the raw response.

        The call form is chosen from the version reported by
        ``client.info()``: ``query=``/``size=`` keywords for major version 8
        and above, otherwise a ``body`` dict.
        """
        if self._major_version(client) >= 8:
            return client.search(index=index_name,
                                 query=script_query,
                                 size=size)
        return client.search(index=index_name,
                             body={
                                 'query': script_query,
                                 'size': size
                             })

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> None:
        """Delete documents from the index by ID.

        Args:
            ids: List of ids to delete.

        Raises:
            ValueError: If ``ids`` is None.
        """

        if ids is None:
            raise ValueError('No ids provided to delete.')

        # TODO: Check if this can be done in bulk
        for doc_id in ids:
            self.client.delete(index=self.index_name, id=doc_id)
@@ -0,0 +1,64 @@
1
+ Metadata-Version: 2.1
2
+ Name: bisheng-langchain
3
+ Version: 0.0.1
4
+ Summary: bisheng langchain modules
5
+ Home-page: https://github.com/dataelement/bisheng
6
+ Author: DataElem
7
+ Author-email: contact@dataelem.com
8
+ License: Apache 2.0
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.6
11
+ Classifier: Programming Language :: Python :: 3.7
12
+ Classifier: Programming Language :: Python :: 3.8
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Operating System :: OS Independent
16
+ Requires-Python: >=3.6
17
+ Description-Content-Type: text/markdown
18
+ Requires-Dist: langchain
19
+ Requires-Dist: openai
20
+ Requires-Dist: zhipuai
21
+ Requires-Dist: websocket-client
22
+ Requires-Dist: elasticsearch
23
+
24
+ ## What is bisheng-langchain?
25
+
26
+ bisheng-langchain is an open-source library that extends langchain to power the building of LLM applications.
27
+ bisheng-langchain provides more components to support Chinese LLMs and Chinese-based token environments for prompt engineering and ICL templates.
28
+
29
+
30
+ The project is a sub-module of [bisheng](https://github.com/dataelement/bisheng).
31
+
32
+
33
+ ## Key features
34
+
35
+ - Retrieval enhancement components, like ESIndex, DBIndex, GraphIndex
36
+ - Supporting Open LLMs and embeddings models
37
+ - High-performance QA chains
38
+ - Highly semantic Chinese token processing
39
+
40
+
41
+ ## Quick start
42
+
43
+ ### Start with Bisheng Platform.
44
+
45
+ We provide an open cloud service for easy use. See [free trial](https://bisheng.dataelem.com/).
46
+
47
+
48
+ ### Install bisheng-langchain
49
+
50
+ - Install from pip: `pip install bisheng-langchain`
51
+ - [Quick Start Guide](https://m7a7tqsztt.feishu.cn/wiki/CTXNwpqGKiMs5FkKlPJcylfonuD)
52
+
53
+
54
+ ## Documentation
55
+
56
+ For guidance on installation, development, deployment, and administration,
57
+ check out [bisheng-langchain Dev Docs](https://dataelem.feishu.cn/wiki/Xaq8wKQjkiYEHNkXuYLc7JkxnPf).
58
+
59
+
60
+ ## Acknowledgments
61
+
62
+ bisheng-langchain adopts dependencies from the following:
63
+
64
+ - Thanks to [langchain](https://github.com/langchain-ai/langchain) for the main framework.
@@ -0,0 +1,41 @@
1
+ bisheng_langchain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ bisheng_langchain/chains/__init__.py,sha256=uCZGGsjlCJsNkQ3VQH9VpqQ-Ny2Ze3WUmEdy9xsdEVQ,123
3
+ bisheng_langchain/chains/combine_documents/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ bisheng_langchain/chains/combine_documents/stuff.py,sha256=z_E_wfhJrAYWcNVRPomPm5fGRDI3hqoC52wcMzgzxVA,2369
5
+ bisheng_langchain/chains/question_answering/__init__.py,sha256=RWbSgTQ0IqZhrXkhaJUKzEXurA9NJE7_6P0zLy0IBjs,8636
6
+ bisheng_langchain/chains/retrieval_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ bisheng_langchain/chains/retrieval_qa/base.py,sha256=1lP3c2A1q7TZkl0G38q7VQ6KjftNaH14zR4x3vHcAvo,3678
8
+ bisheng_langchain/chat_models/__init__.py,sha256=7b3X2_aULq9mSihlds99Q_FfgIo_X4gejsTRWBcP_7k,399
9
+ bisheng_langchain/chat_models/host_llm.py,sha256=B1g8F9hPGuLPv29KPqvO-gcs2Ak-b-kDFL4nHasxCP4,15246
10
+ bisheng_langchain/chat_models/minimax.py,sha256=2ofLC-fpQIwA2MrqJCYj0v73VgkrQMqzv_5KX0xYnfg,13961
11
+ bisheng_langchain/chat_models/proxy_llm.py,sha256=AdvCOPhEPTux7nVx7geXryHdRF9Bcm3pYKfRRMzF89c,13949
12
+ bisheng_langchain/chat_models/wenxin.py,sha256=D9gnVGFovoQBuLvAo9DaVOjrym2elgbVrcBq3KYSd4A,13791
13
+ bisheng_langchain/chat_models/xunfeiai.py,sha256=Guo0oPl-o-LhhzAqnBDgtXYsl-C7M00id3pWj1CYBKM,14013
14
+ bisheng_langchain/chat_models/zhipuai.py,sha256=O4M2u6cIgO-ERKJsjirdAPEfcmDHHXidxvp5DKuZql8,14939
15
+ bisheng_langchain/chat_models/interface/__init__.py,sha256=KwcZMPSxFiXu6joXoZEgq6THxZeDXA8neZcOuLKBpUk,443
16
+ bisheng_langchain/chat_models/interface/minimax.py,sha256=uEh0lI_wa_sfiiJkDTMyiSbjxMjgl392bH828vleogA,4420
17
+ bisheng_langchain/chat_models/interface/openai.py,sha256=6h25vgJWV79tHelJfbKsrbgjNrRq9DYZ30aOxlGIQN4,2078
18
+ bisheng_langchain/chat_models/interface/types.py,sha256=SUCcDTnHeuLEQtTGZW6QCIRDuCTPIbddvV7XEQTI8qI,1134
19
+ bisheng_langchain/chat_models/interface/utils.py,sha256=qww_uYsWDqK7cLuv-KzZmmlg9SZAHOi4R_6I6S4XLIk,65
20
+ bisheng_langchain/chat_models/interface/wenxin.py,sha256=2r5sBjym-fihWgHT8v7LZ0374p3umQtkczLFFSB9Z5s,3998
21
+ bisheng_langchain/chat_models/interface/xunfei.py,sha256=UvOCBzUcXBCSzITCsCJ04_pRcAu31iv-fgcPAiF-UAE,7049
22
+ bisheng_langchain/chat_models/interface/zhipuai.py,sha256=A9yST0a_qCPs5IP67bICL4nhbHaWIzvLiSiuzpCdPGY,2633
23
+ bisheng_langchain/document_loaders/__init__.py,sha256=nxi44U76yvkY7OmwLGRi4Xjz-COPkCdQmmviGAyLADE,81
24
+ bisheng_langchain/document_loaders/elem_html.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
+ bisheng_langchain/document_loaders/elem_image.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
+ bisheng_langchain/document_loaders/elem_pdf.py,sha256=64kUITkrTVJe9CH6IAVSdDVcn2Ekx2PM-jT0cdClXlo,22716
27
+ bisheng_langchain/document_loaders/parsers/__init__.py,sha256=nou6EOshbI3PQ8OgU3Ni2rjmjsZSZXvBVs-DB9tYsCw,67
28
+ bisheng_langchain/document_loaders/parsers/image.py,sha256=7Vx4dD_WiSTojS4TMIJFxfE8nvze0kwNnwTd6f1cLds,938
29
+ bisheng_langchain/document_loaders/parsers/test_image.py,sha256=EJHozq5oFfLBlLL5Lr6XFkrkvSttPpohprs9OjDzAKM,8685
30
+ bisheng_langchain/embeddings/__init__.py,sha256=QMsnkSfVLW70OdplI6X5N38cKXTPC7yP-x1i2A4crfI,234
31
+ bisheng_langchain/embeddings/host_embedding.py,sha256=_a_ggQoxSRgPv3aKZTXIzjsYAwmtPt3AJZ-oEQgcz_A,4422
32
+ bisheng_langchain/embeddings/wenxin.py,sha256=8kYqWuHydx5Cylb_Lmdti0YLHrOM1Qha3eMuVIPitOk,4828
33
+ bisheng_langchain/embeddings/interface/__init__.py,sha256=GNY3tibpRxpAdAfSvQmXBKo0xKSLke_9y4clofi_WOE,98
34
+ bisheng_langchain/embeddings/interface/types.py,sha256=VdurbtsnjCPdlOjPFcK2Mg6r9bJYYHb3tepvkk-y3nM,461
35
+ bisheng_langchain/embeddings/interface/wenxin.py,sha256=5d9gI4enmfkD80s0FHKiDt33O0mwM8Xc5WTubnMUy8c,3104
36
+ bisheng_langchain/vectorstores/__init__.py,sha256=K3xQouSGl05Q0ehFCKZafip-35NzCrv8SCANvfxDpKE,96
37
+ bisheng_langchain/vectorstores/elastic_keywords_search.py,sha256=MrNrY1YdSmqKVpWt1y6VKD3Daw9ZummWx_iOtfUk-TE,10922
38
+ bisheng_langchain-0.0.1.dist-info/METADATA,sha256=wQ8dM4fN0tClJUtUmrBLnKVn_rag4aCG5mjmfyZtDUA,2036
39
+ bisheng_langchain-0.0.1.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
40
+ bisheng_langchain-0.0.1.dist-info/top_level.txt,sha256=Z6pPNyCo4ihyr9iqGQbH8sJiC4dAUwA_mAyGRQB5_Fs,18
41
+ bisheng_langchain-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.37.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ bisheng_langchain