swarmauri_parser_bertembedding 0.6.0.dev154__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swarmauri_parser_bertembedding-0.6.0.dev154/PKG-INFO +20 -0
- swarmauri_parser_bertembedding-0.6.0.dev154/README.md +1 -0
- swarmauri_parser_bertembedding-0.6.0.dev154/pyproject.toml +57 -0
- swarmauri_parser_bertembedding-0.6.0.dev154/swarmauri_parser_bertembedding/BERTEmbeddingParser.py +72 -0
- swarmauri_parser_bertembedding-0.6.0.dev154/swarmauri_parser_bertembedding/__init__.py +12 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: swarmauri_parser_bertembedding
|
|
3
|
+
Version: 0.6.0.dev154
|
|
4
|
+
Summary: Swarmauri Bert Embedding Parser
|
|
5
|
+
License: Apache-2.0
|
|
6
|
+
Author: Jacob Stewart
|
|
7
|
+
Author-email: jacob@swarmauri.com
|
|
8
|
+
Requires-Python: >=3.10,<3.13
|
|
9
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Requires-Dist: swarmauri_base (>=0.6.0.dev154,<0.7.0)
|
|
15
|
+
Requires-Dist: swarmauri_core (>=0.6.0.dev154,<0.7.0)
|
|
16
|
+
Requires-Dist: transformers (>=4.45.0)
|
|
17
|
+
Project-URL: Repository, http://github.com/swarmauri/swarmauri-sdk
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# Swarmauri BERT Embedding Parser
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Swarmauri BERT Embedding Parser
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
[tool.poetry]
name = "swarmauri_parser_bertembedding"
version = "0.6.0.dev154"
description = "Swarmauri Bert Embedding Parser"
authors = ["Jacob Stewart <jacob@swarmauri.com>"]
license = "Apache-2.0"
readme = "README.md"
repository = "https://github.com/swarmauri/swarmauri-sdk"
classifiers = [
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12"
]

[tool.poetry.dependencies]
python = ">=3.10,<3.13"

# Swarmauri
swarmauri_core = {version = "^0.6.0.dev154"}
swarmauri_base = {version = "^0.6.0.dev154"}
# BERTEmbeddingParser.py imports swarmauri_standard.documents.Document, so the
# package must be declared here rather than relied on transitively.
swarmauri_standard = {version = "^0.6.0.dev154"}

# Dependencies
transformers = ">=4.45.0"
# BERTEmbeddingParser.py imports torch directly and requests "pt" tensors.
# NOTE(review): lower bound chosen conservatively -- confirm against the
# versions the project actually tests with.
torch = ">=2.0.0"


[tool.poetry.group.dev.dependencies]
flake8 = "^7.0"
pytest = "^8.0"
pytest-asyncio = ">=0.24.0"
pytest-xdist = "^3.6.1"
pytest-json-report = "^1.5.0"
python-dotenv = "*"
requests = "^2.32.3"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
norecursedirs = ["combined", "scripts"]

markers = [
    "test: standard test",
    "unit: Unit tests",
    "integration: Integration tests",
    "acceptance: Acceptance tests",
    "experimental: Experimental tests"
]
log_cli = true
log_cli_level = "INFO"
log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
asyncio_default_fixture_loop_scope = "function"

# Entry point through which the parser is discovered as a Swarmauri plugin.
[tool.poetry.plugins."swarmauri.parsers"]
BERTEmbeddingParser = "swarmauri_parser_bertembedding.BERTEmbeddingParser:BERTEmbeddingParser"
|
swarmauri_parser_bertembedding-0.6.0.dev154/swarmauri_parser_bertembedding/BERTEmbeddingParser.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from typing import List, Union, Any, Literal
|
|
2
|
+
from swarmauri_core.ComponentBase import ComponentBase
|
|
3
|
+
from transformers import BertTokenizer, BertModel
|
|
4
|
+
import torch
|
|
5
|
+
from pydantic import PrivateAttr
|
|
6
|
+
from swarmauri_core.documents.IDocument import IDocument
|
|
7
|
+
from swarmauri_standard.documents.Document import Document
|
|
8
|
+
from swarmauri_base.parsers.ParserBase import ParserBase
|
|
9
|
+
|
|
10
|
+
@ComponentBase.register_type(ParserBase, "BERTEmbeddingParser")
class BERTEmbeddingParser(ParserBase):
    """
    A parser that transforms input text into document embeddings using BERT.

    The input is tokenized, passed through a pre-trained BERT model, and the
    token vectors of each input text are averaged into one vector, which is
    used as the content of the corresponding document.

    Attributes:
    - parser_model_name (str): Name of the pre-trained BERT checkpoint passed to
      ``from_pretrained`` for both the tokenizer and the model.
    """

    parser_model_name: str = "bert-base-uncased"
    _model: Any = PrivateAttr()
    type: Literal["BERTEmbeddingParser"] = "BERTEmbeddingParser"

    def __init__(self, **kwargs):
        """
        Initializes the BERTEmbeddingParser and loads its tokenizer and model.

        Parameters:
        - **kwargs: Field values forwarded to the base class. Pass
          ``parser_model_name`` to use a checkpoint other than the default.
        """
        super().__init__(**kwargs)
        # NOTE(review): `tokenizer` is assigned as a plain attribute while
        # `_model` is a declared private attribute; this presumably relies on
        # the base model configuration accepting extra attributes -- confirm.
        # The public name is kept so existing callers are not broken.
        self.tokenizer = BertTokenizer.from_pretrained(self.parser_model_name)
        self._model = BertModel.from_pretrained(self.parser_model_name)
        self._model.eval()  # Set model to evaluation mode

    def parse(self, data: Union[str, Any]) -> List[IDocument]:
        """
        Tokenizes input data and generates embeddings using a BERT model.

        Parameters:
        - data (Union[str, Any]): Input data, expected to be a single string or
          a batch of strings. Inputs longer than 512 tokens are truncated.

        Returns:
        - List[IDocument]: One document per input text, in input order. Each
          document's content is the mean of that text's token embeddings and
          its ``doc_id`` is the text's position in the batch. An empty batch
          yields an empty list.
        """
        # Nothing to embed: skip the tokenizer and model entirely.
        if isinstance(data, (list, tuple)) and not data:
            return []

        # Tokenization
        inputs = self.tokenizer(
            data, return_tensors="pt", padding=True, truncation=True, max_length=512
        )

        # Generate embeddings without tracking gradients (inference only).
        with torch.no_grad():
            outputs = self._model(**inputs)

        # Last hidden state: (batch_size, sequence_length, hidden_size)
        token_embeddings = outputs.last_hidden_state

        # Average only over real tokens. With padding=True, shorter texts in a
        # batch are padded to the longest one; including those padding
        # positions in the mean would make a text's embedding depend on the
        # other texts it happened to be batched with.
        attention_mask = inputs.get("attention_mask")
        if attention_mask is None:
            # No mask available: fall back to a plain mean over all positions.
            pooled = token_embeddings.mean(dim=1)
        else:
            weights = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)
            # clamp guards against division by zero for an all-masked row.
            token_counts = weights.sum(dim=1).clamp(min=1)
            pooled = (token_embeddings * weights).sum(dim=1) / token_counts

        # Convert to a numpy array with one row per input text.
        doc_embeddings = pooled.detach().cpu().numpy()

        # Creating document object(s)
        documents = [
            Document(
                doc_id=str(i), content=emb, metadata={"source": "BERTEmbeddingParser"}
            )
            for i, emb in enumerate(doc_embeddings)
        ]

        return documents
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Package entry point for the Swarmauri BERT embedding parser plugin."""

from importlib.metadata import PackageNotFoundError, version

from .BERTEmbeddingParser import BERTEmbeddingParser

try:
    # Read the version from the installed distribution's metadata so it cannot
    # drift from the version declared in pyproject.toml.
    __version__ = version("swarmauri_parser_bertembedding")
except PackageNotFoundError:
    # Package is not installed (e.g. imported straight from a source tree);
    # fall back to the version of this release.
    __version__ = "0.6.0.dev154"

__long_desc__ = """

# Swarmauri BERTEmbedding Parser Plugin

Visit us at: https://swarmauri.com
Follow us at: https://github.com/swarmauri
Star us at: https://github.com/swarmauri/swarmauri-sdk

"""
|