awslabs.cdk-mcp-server 0.0.10417__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awslabs/__init__.py +2 -0
- awslabs/cdk_mcp_server/__init__.py +8 -0
- awslabs/cdk_mcp_server/core/__init__.py +1 -0
- awslabs/cdk_mcp_server/core/resources.py +271 -0
- awslabs/cdk_mcp_server/core/search_utils.py +182 -0
- awslabs/cdk_mcp_server/core/server.py +74 -0
- awslabs/cdk_mcp_server/core/tools.py +324 -0
- awslabs/cdk_mcp_server/data/__init__.py +1 -0
- awslabs/cdk_mcp_server/data/cdk_nag_parser.py +331 -0
- awslabs/cdk_mcp_server/data/construct_descriptions.py +32 -0
- awslabs/cdk_mcp_server/data/genai_cdk_loader.py +423 -0
- awslabs/cdk_mcp_server/data/lambda_powertools_loader.py +48 -0
- awslabs/cdk_mcp_server/data/schema_generator.py +666 -0
- awslabs/cdk_mcp_server/data/solutions_constructs_parser.py +782 -0
- awslabs/cdk_mcp_server/server.py +7 -0
- awslabs/cdk_mcp_server/static/CDK_GENERAL_GUIDANCE.md +232 -0
- awslabs/cdk_mcp_server/static/CDK_NAG_GUIDANCE.md +192 -0
- awslabs/cdk_mcp_server/static/__init__.py +5 -0
- awslabs/cdk_mcp_server/static/bedrock/agent/actiongroups.md +137 -0
- awslabs/cdk_mcp_server/static/bedrock/agent/alias.md +39 -0
- awslabs/cdk_mcp_server/static/bedrock/agent/collaboration.md +91 -0
- awslabs/cdk_mcp_server/static/bedrock/agent/creation.md +149 -0
- awslabs/cdk_mcp_server/static/bedrock/agent/custom_orchestration.md +74 -0
- awslabs/cdk_mcp_server/static/bedrock/agent/overview.md +78 -0
- awslabs/cdk_mcp_server/static/bedrock/agent/prompt_override.md +70 -0
- awslabs/cdk_mcp_server/static/bedrock/bedrockguardrails.md +188 -0
- awslabs/cdk_mcp_server/static/bedrock/knowledgebases/chunking.md +137 -0
- awslabs/cdk_mcp_server/static/bedrock/knowledgebases/datasources.md +225 -0
- awslabs/cdk_mcp_server/static/bedrock/knowledgebases/kendra.md +81 -0
- awslabs/cdk_mcp_server/static/bedrock/knowledgebases/overview.md +116 -0
- awslabs/cdk_mcp_server/static/bedrock/knowledgebases/parsing.md +36 -0
- awslabs/cdk_mcp_server/static/bedrock/knowledgebases/transformation.md +30 -0
- awslabs/cdk_mcp_server/static/bedrock/knowledgebases/vector/aurora.md +185 -0
- awslabs/cdk_mcp_server/static/bedrock/knowledgebases/vector/creation.md +80 -0
- awslabs/cdk_mcp_server/static/bedrock/knowledgebases/vector/opensearch.md +56 -0
- awslabs/cdk_mcp_server/static/bedrock/knowledgebases/vector/pinecone.md +66 -0
- awslabs/cdk_mcp_server/static/bedrock/profiles.md +153 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/agent/actiongroups.md +137 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/agent/alias.md +39 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/agent/collaboration.md +91 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/agent/creation.md +149 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/agent/custom_orchestration.md +74 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/agent/overview.md +78 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/agent/prompt_override.md +70 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/bedrockguardrails.md +188 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/knowledgebases/chunking.md +137 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/knowledgebases/datasources.md +225 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/knowledgebases/kendra.md +81 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/knowledgebases/overview.md +116 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/knowledgebases/parsing.md +36 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/knowledgebases/transformation.md +30 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/knowledgebases/vector/aurora.md +185 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/knowledgebases/vector/creation.md +80 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/knowledgebases/vector/opensearch.md +56 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/knowledgebases/vector/pinecone.md +66 -0
- awslabs/cdk_mcp_server/static/genai_cdk/bedrock/profiles.md +153 -0
- awslabs/cdk_mcp_server/static/genai_cdk/opensearch-vectorindex/overview.md +135 -0
- awslabs/cdk_mcp_server/static/genai_cdk/opensearchserverless/overview.md +17 -0
- awslabs/cdk_mcp_server/static/lambda_powertools/bedrock.md +127 -0
- awslabs/cdk_mcp_server/static/lambda_powertools/cdk.md +99 -0
- awslabs/cdk_mcp_server/static/lambda_powertools/dependencies.md +45 -0
- awslabs/cdk_mcp_server/static/lambda_powertools/index.md +36 -0
- awslabs/cdk_mcp_server/static/lambda_powertools/insights.md +95 -0
- awslabs/cdk_mcp_server/static/lambda_powertools/logging.md +43 -0
- awslabs/cdk_mcp_server/static/lambda_powertools/metrics.md +93 -0
- awslabs/cdk_mcp_server/static/lambda_powertools/tracing.md +63 -0
- awslabs/cdk_mcp_server/static/opensearch-vectorindex/overview.md +135 -0
- awslabs/cdk_mcp_server/static/opensearchserverless/overview.md +17 -0
- awslabs_cdk_mcp_server-0.0.10417.dist-info/METADATA +14 -0
- awslabs_cdk_mcp_server-0.0.10417.dist-info/RECORD +72 -0
- awslabs_cdk_mcp_server-0.0.10417.dist-info/WHEEL +4 -0
- awslabs_cdk_mcp_server-0.0.10417.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# Vector Knowledge Base - Chunking Strategies
|
|
2
|
+
|
|
3
|
+
## Available Strategies
|
|
4
|
+
|
|
5
|
+
### Default Chunking
|
|
6
|
+
|
|
7
|
+
Applies Fixed Chunking with the default chunk size of 300 tokens and 20% overlap.
|
|
8
|
+
|
|
9
|
+
#### TypeScript
|
|
10
|
+
|
|
11
|
+
```ts
|
|
12
|
+
ChunkingStrategy.DEFAULT;
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
#### Python
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
ChunkingStrategy.DEFAULT
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
### Fixed Size Chunking
|
|
22
|
+
|
|
23
|
+
This method divides the data into fixed-size chunks, with each chunk
|
|
24
|
+
containing a predetermined number of tokens. This strategy is useful when the data is uniform
|
|
25
|
+
in size and structure.
|
|
26
|
+
|
|
27
|
+
#### TypeScript
|
|
28
|
+
|
|
29
|
+
```ts
|
|
30
|
+
// Fixed Size Chunking with sane defaults.
|
|
31
|
+
ChunkingStrategy.FIXED_SIZE;
|
|
32
|
+
|
|
33
|
+
// Fixed Size Chunking with custom values.
|
|
34
|
+
ChunkingStrategy.fixedSize({ maxTokens: 200, overlapPercentage: 25 });
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
#### Python
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
# Fixed Size Chunking with sane defaults.
|
|
41
|
+
ChunkingStrategy.FIXED_SIZE
|
|
42
|
+
|
|
43
|
+
# Fixed Size Chunking with custom values.
|
|
44
|
+
ChunkingStrategy.fixed_size(
|
|
45
|
+
max_tokens= 200,
|
|
46
|
+
overlap_percentage= 25
|
|
47
|
+
)
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Hierarchical Chunking
|
|
51
|
+
|
|
52
|
+
This strategy organizes data into layers of chunks, with the first
|
|
53
|
+
layer containing large chunks and the second layer containing smaller chunks derived from the first.
|
|
54
|
+
It is ideal for data with inherent hierarchies or nested structures.
|
|
55
|
+
|
|
56
|
+
#### TypeScript
|
|
57
|
+
|
|
58
|
+
```ts
|
|
59
|
+
// Hierarchical Chunking with the default for Cohere Models.
|
|
60
|
+
ChunkingStrategy.HIERARCHICAL_COHERE;
|
|
61
|
+
|
|
62
|
+
// Hierarchical Chunking with the default for Titan Models.
|
|
63
|
+
ChunkingStrategy.HIERARCHICAL_TITAN;
|
|
64
|
+
|
|
65
|
+
// Hierarchical Chunking with custom values. The maximum chunk size depends on the model.
|
|
66
|
+
// Amazon Titan Text Embeddings: 8192. Cohere Embed models: 512
|
|
67
|
+
ChunkingStrategy.hierarchical({
|
|
68
|
+
overlapTokens: 60,
|
|
69
|
+
maxParentTokenSize: 1500,
|
|
70
|
+
maxChildTokenSize: 300,
|
|
71
|
+
});
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
#### Python
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
# Hierarchical Chunking with the default for Cohere Models.
|
|
78
|
+
ChunkingStrategy.HIERARCHICAL_COHERE
|
|
79
|
+
|
|
80
|
+
# Hierarchical Chunking with the default for Titan Models.
|
|
81
|
+
ChunkingStrategy.HIERARCHICAL_TITAN
|
|
82
|
+
|
|
83
|
+
# Hierarchical Chunking with custom values. The maximum chunk size depends on the model.
|
|
84
|
+
# Amazon Titan Text Embeddings: 8192. Cohere Embed models: 512
|
|
85
|
+
chunking_strategy= ChunkingStrategy.hierarchical(
|
|
86
|
+
overlap_tokens=60,
|
|
87
|
+
max_parent_token_size=1500,
|
|
88
|
+
max_child_token_size=300
|
|
89
|
+
)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Semantic Chunking
|
|
93
|
+
|
|
94
|
+
This method splits data into smaller documents based on groups of similar
|
|
95
|
+
content derived from the text using natural language processing. It helps preserve contextual
|
|
96
|
+
relationships and ensures accurate and contextually appropriate results.
|
|
97
|
+
|
|
98
|
+
#### TypeScript
|
|
99
|
+
|
|
100
|
+
```ts
|
|
101
|
+
// Semantic Chunking with sane defaults.
|
|
102
|
+
ChunkingStrategy.SEMANTIC;
|
|
103
|
+
|
|
104
|
+
// Semantic Chunking with custom values.
|
|
105
|
+
ChunkingStrategy.semantic({ bufferSize: 0, breakpointPercentileThreshold: 95, maxTokens: 300 });
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
#### Python
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
# Semantic Chunking with sane defaults.
|
|
112
|
+
ChunkingStrategy.SEMANTIC
|
|
113
|
+
|
|
114
|
+
# Semantic Chunking with custom values.
|
|
115
|
+
ChunkingStrategy.semantic(
|
|
116
|
+
buffer_size=0,
|
|
117
|
+
breakpoint_percentile_threshold=95,
|
|
118
|
+
max_tokens=300
|
|
119
|
+
)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### No Chunking
|
|
123
|
+
|
|
124
|
+
This strategy treats each file as one chunk. If you choose this option,
|
|
125
|
+
you may want to pre-process your documents by splitting them into separate files.
|
|
126
|
+
|
|
127
|
+
#### TypeScript
|
|
128
|
+
|
|
129
|
+
```ts
|
|
130
|
+
ChunkingStrategy.NONE;
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
#### Python
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
ChunkingStrategy.NONE
|
|
137
|
+
```
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# Knowledge Base Data Sources
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
This document provides examples of adding various data sources to a Knowledge Base in Amazon Bedrock.
|
|
6
|
+
|
|
7
|
+
## Example
|
|
8
|
+
|
|
9
|
+
### TypeScript
|
|
10
|
+
|
|
11
|
+
```ts
|
|
12
|
+
const app = new cdk.App();
|
|
13
|
+
const stack = new cdk.Stack(app, 'aws-cdk-bedrock-data-sources-integ-test');
|
|
14
|
+
|
|
15
|
+
const kb = new VectorKnowledgeBase(stack, 'MyKnowledgeBase', {
|
|
16
|
+
name: 'MyKnowledgeBase',
|
|
17
|
+
embeddingsModel: BedrockFoundationModel.COHERE_EMBED_MULTILINGUAL_V3,
|
|
18
|
+
});
|
|
19
|
+
|
|
20
|
+
const bucket = new Bucket(stack, 'Bucket', {});
|
|
21
|
+
const lambdaFunction = new Function(stack, 'MyFunction', {
|
|
22
|
+
runtime: cdk.aws_lambda.Runtime.PYTHON_3_9,
|
|
23
|
+
handler: 'index.handler',
|
|
24
|
+
code: cdk.aws_lambda.Code.fromInline('print("Hello, World!")'),
|
|
25
|
+
});
|
|
26
|
+
|
|
27
|
+
const secret = new Secret(stack, 'Secret');
|
|
28
|
+
const key = new Key(stack, 'Key');
|
|
29
|
+
|
|
30
|
+
kb.addWebCrawlerDataSource({
|
|
31
|
+
sourceUrls: ['https://docs.aws.amazon.com/'],
|
|
32
|
+
chunkingStrategy: ChunkingStrategy.HIERARCHICAL_COHERE,
|
|
33
|
+
customTransformation: CustomTransformation.lambda({
|
|
34
|
+
lambdaFunction: lambdaFunction,
|
|
35
|
+
s3BucketUri: `s3://${bucket.bucketName}/chunk-processor/`,
|
|
36
|
+
}),
|
|
37
|
+
});
|
|
38
|
+
|
|
39
|
+
kb.addS3DataSource({
|
|
40
|
+
bucket,
|
|
41
|
+
chunkingStrategy: ChunkingStrategy.SEMANTIC,
|
|
42
|
+
parsingStrategy: ParsingStategy.foundationModel({
|
|
43
|
+
model: BedrockFoundationModel.ANTHROPIC_CLAUDE_SONNET_V1_0,
|
|
44
|
+
}),
|
|
45
|
+
});
|
|
46
|
+
|
|
47
|
+
kb.addConfluenceDataSource({
|
|
48
|
+
dataSourceName: 'TestDataSource',
|
|
49
|
+
authSecret: secret,
|
|
50
|
+
kmsKey: key,
|
|
51
|
+
confluenceUrl: 'https://example.atlassian.net',
|
|
52
|
+
filters: [
|
|
53
|
+
{
|
|
54
|
+
objectType: ConfluenceObjectType.ATTACHMENT,
|
|
55
|
+
includePatterns: ['.*\\.pdf'],
|
|
56
|
+
excludePatterns: ['.*private.*\\.pdf'],
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
objectType: ConfluenceObjectType.PAGE,
|
|
60
|
+
includePatterns: ['.*public.*\\.pdf'],
|
|
61
|
+
excludePatterns: ['.*confidential.*\\.pdf'],
|
|
62
|
+
},
|
|
63
|
+
],
|
|
64
|
+
});
|
|
65
|
+
|
|
66
|
+
kb.addSalesforceDataSource({
|
|
67
|
+
authSecret: secret,
|
|
68
|
+
endpoint: 'https://your-instance.my.salesforce.com',
|
|
69
|
+
kmsKey: key,
|
|
70
|
+
filters: [
|
|
71
|
+
{
|
|
72
|
+
objectType: SalesforceObjectType.ATTACHMENT,
|
|
73
|
+
includePatterns: ['.*\\.pdf'],
|
|
74
|
+
excludePatterns: ['.*private.*\\.pdf'],
|
|
75
|
+
},
|
|
76
|
+
{
|
|
77
|
+
objectType: SalesforceObjectType.CONTRACT,
|
|
78
|
+
includePatterns: ['.*public.*\\.pdf'],
|
|
79
|
+
excludePatterns: ['.*confidential.*\\.pdf'],
|
|
80
|
+
},
|
|
81
|
+
],
|
|
82
|
+
});
|
|
83
|
+
|
|
84
|
+
kb.addSharePointDataSource({
|
|
85
|
+
dataSourceName: 'SharepointDataSource',
|
|
86
|
+
authSecret: secret,
|
|
87
|
+
kmsKey: key,
|
|
88
|
+
domain: 'yourdomain',
|
|
89
|
+
siteUrls: ['https://yourdomain.sharepoint.com/sites/mysite'],
|
|
90
|
+
tenantId: '888d0b57-69f1-4fb8-957f-e1f0bedf64de',
|
|
91
|
+
filters: [
|
|
92
|
+
{
|
|
93
|
+
objectType: SharePointObjectType.PAGE,
|
|
94
|
+
includePatterns: ['.*\\.pdf'],
|
|
95
|
+
excludePatterns: ['.*private.*\\.pdf'],
|
|
96
|
+
},
|
|
97
|
+
{
|
|
98
|
+
objectType: SharePointObjectType.FILE,
|
|
99
|
+
includePatterns: ['.*public.*\\.pdf'],
|
|
100
|
+
excludePatterns: ['.*confidential.*\\.pdf'],
|
|
101
|
+
},
|
|
102
|
+
],
|
|
103
|
+
});
|
|
104
|
+
|
|
105
|
+
kb.addCustomDataSource({
|
|
106
|
+
dataSourceName: 'CustomDataSource',
|
|
107
|
+
chunkingStrategy: ChunkingStrategy.FIXED_SIZE,
|
|
108
|
+
});
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Python
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from aws_cdk import (
|
|
115
|
+
Stack,
|
|
116
|
+
aws_s3 as s3,
|
|
117
|
+
aws_lambda as _lambda,
|
|
118
|
+
aws_secretsmanager as secretsmanager,
|
|
119
|
+
aws_kms as kms
|
|
120
|
+
)
|
|
121
|
+
from constructs import Construct
|
|
122
|
+
from cdklabs.generative_ai_cdk_constructs import (
|
|
123
|
+
bedrock
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
class PythonTestStack(Stack):
|
|
127
|
+
|
|
128
|
+
def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
|
|
129
|
+
super().__init__(scope, construct_id, **kwargs)
|
|
130
|
+
|
|
131
|
+
kb = bedrock.VectorKnowledgeBase(self, 'MyKnowledgeBase',
|
|
132
|
+
embeddings_model= bedrock.BedrockFoundationModel.COHERE_EMBED_MULTILINGUAL_V3,
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
docBucket = s3.Bucket(self, 'Bucket')
|
|
136
|
+
|
|
137
|
+
function = _lambda.Function(self, 'MyFunction',
|
|
138
|
+
runtime=_lambda.Runtime.PYTHON_3_12,
|
|
139
|
+
handler='index.handler',
|
|
140
|
+
code=_lambda.Code.from_inline('print("Hello, World!")'),
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
kb.add_web_crawler_data_source(
|
|
144
|
+
source_urls= ['https://docs.aws.amazon.com/'],
|
|
145
|
+
chunking_strategy= bedrock.ChunkingStrategy.HIERARCHICAL_COHERE,
|
|
146
|
+
custom_transformation= bedrock.CustomTransformation.lambda_(
|
|
147
|
+
lambda_function= function,
|
|
148
|
+
s3_bucket_uri= f's3://{docBucket.bucket_name}/chunk-processor/'
|
|
149
|
+
)
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
kb.add_s3_data_source(
|
|
153
|
+
bucket= docBucket,
|
|
154
|
+
chunking_strategy= bedrock.ChunkingStrategy.SEMANTIC,
|
|
155
|
+
parsing_strategy= bedrock.ParsingStategy.foundation_model(
|
|
156
|
+
parsing_model= bedrock.BedrockFoundationModel.ANTHROPIC_CLAUDE_SONNET_V1_0.as_i_model(self)
|
|
157
|
+
)
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
secret = secretsmanager.Secret(self, 'Secret')
|
|
161
|
+
key = kms.Key(self, 'Key')
|
|
162
|
+
|
|
163
|
+
kb.add_confluence_data_source(
|
|
164
|
+
data_source_name='TestDataSource',
|
|
165
|
+
auth_secret=secret,
|
|
166
|
+
kms_key=key,
|
|
167
|
+
confluence_url='https://example.atlassian.net',
|
|
168
|
+
filters=[
|
|
169
|
+
bedrock.ConfluenceCrawlingFilters(
|
|
170
|
+
object_type=bedrock.ConfluenceObjectType.ATTACHMENT,
|
|
171
|
+
include_patterns= [".*\\.pdf"],
|
|
172
|
+
exclude_patterns= [".*private.*\\.pdf"],
|
|
173
|
+
),
|
|
174
|
+
bedrock.ConfluenceCrawlingFilters(
|
|
175
|
+
object_type=bedrock.ConfluenceObjectType.PAGE,
|
|
176
|
+
include_patterns= [".*public.*\\.pdf"],
|
|
177
|
+
exclude_patterns= [".*confidential.*\\.pdf"],
|
|
178
|
+
),
|
|
179
|
+
]
|
|
180
|
+
)
|
|
181
|
+
|
|
182
|
+
kb.add_salesforce_data_source(
|
|
183
|
+
auth_secret=secret,
|
|
184
|
+
endpoint='https://your-instance.my.salesforce.com',
|
|
185
|
+
kms_key=key,
|
|
186
|
+
filters=[
|
|
187
|
+
bedrock.SalesforceCrawlingFilters(
|
|
188
|
+
object_type=bedrock.SalesforceObjectType.ATTACHMENT,
|
|
189
|
+
include_patterns= [".*\\.pdf"],
|
|
190
|
+
exclude_patterns= [".*private.*\\.pdf"],
|
|
191
|
+
),
|
|
192
|
+
bedrock.SalesforceCrawlingFilters(
|
|
193
|
+
object_type=bedrock.SalesforceObjectType.CONTRACT,
|
|
194
|
+
include_patterns= [".*public.*\\.pdf"],
|
|
195
|
+
exclude_patterns= [".*confidential.*\\.pdf"],
|
|
196
|
+
),
|
|
197
|
+
]
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
kb.add_share_point_data_source(
|
|
201
|
+
data_source_name='SharepointDataSource',
|
|
202
|
+
auth_secret=secret,
|
|
203
|
+
kms_key=key,
|
|
204
|
+
domain='yourDomain',
|
|
205
|
+
site_urls= ['https://yourdomain.sharepoint.com/sites/mysite'],
|
|
206
|
+
tenant_id='888d0b57-69f1-4fb8-957f-e1f0bedf64de',
|
|
207
|
+
filters=[
|
|
208
|
+
bedrock.SharePointCrawlingFilters(
|
|
209
|
+
object_type=bedrock.SharePointObjectType.PAGE,
|
|
210
|
+
include_patterns= [".*\\.pdf"],
|
|
211
|
+
exclude_patterns= [".*private.*\\.pdf"],
|
|
212
|
+
),
|
|
213
|
+
bedrock.SharePointCrawlingFilters(
|
|
214
|
+
object_type=bedrock.SharePointObjectType.FILE,
|
|
215
|
+
include_patterns= [".*public.*\\.pdf"],
|
|
216
|
+
exclude_patterns= [".*confidential.*\\.pdf"],
|
|
217
|
+
),
|
|
218
|
+
]
|
|
219
|
+
)
|
|
220
|
+
|
|
221
|
+
kb.add_custom_data_source(
|
|
222
|
+
data_source_name='CustomDataSource',
|
|
223
|
+
chunking_strategy=bedrock.ChunkingStrategy.FIXED_SIZE,
|
|
224
|
+
)
|
|
225
|
+
```
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# Kendra Knowledge Base
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
With Amazon Bedrock Knowledge Bases, you can build a knowledge base from an Amazon Kendra GenAI index to create more sophisticated and accurate Retrieval Augmented Generation (RAG)-powered digital assistants. By combining an Amazon Kendra GenAI index with Amazon Bedrock Knowledge Bases, you can:
|
|
6
|
+
|
|
7
|
+
- Reuse your indexed content across multiple Amazon Bedrock applications without rebuilding indexes or re-ingesting data.
|
|
8
|
+
- Leverage the advanced GenAI capabilities of Amazon Bedrock while benefiting from the high-accuracy information retrieval of Amazon Kendra.
|
|
9
|
+
- Customize your digital assistant's behavior using the tools of Amazon Bedrock while maintaining the semantic accuracy of an Amazon Kendra GenAI index.
|
|
10
|
+
|
|
11
|
+
## Kendra Knowledge Base Properties
|
|
12
|
+
|
|
13
|
+
| Name | Type | Required | Description |
|
|
14
|
+
|------|------|----------|-------------|
|
|
15
|
+
| kendraIndex | IKendraGenAiIndex | Yes | The Kendra Index to use for the knowledge base. |
|
|
16
|
+
| name | string | No | The name of the knowledge base. If not provided, a name will be auto-generated. |
|
|
17
|
+
| description | string | No | Description of the knowledge base. |
|
|
18
|
+
| instruction | string | No | Instructions for the knowledge base. |
|
|
19
|
+
| existingRole | iam.IRole | No | An existing IAM role to use for the knowledge base. If not provided, a new role will be created. |
|
|
20
|
+
|
|
21
|
+
## Example
|
|
22
|
+
|
|
23
|
+
### TypeScript
|
|
24
|
+
|
|
25
|
+
```ts
|
|
26
|
+
import * as s3 from 'aws-cdk-lib/aws-s3';
|
|
27
|
+
import { bedrock, kendra } from '@cdklabs/generative-ai-cdk-constructs';
|
|
28
|
+
|
|
29
|
+
const cmk = new kms.Key(stack, 'cmk', {});
|
|
30
|
+
|
|
31
|
+
// you can create a new index using the api below
|
|
32
|
+
const index = new kendra.KendraGenAiIndex(this, 'index', {
|
|
33
|
+
name: 'kendra-index-cdk',
|
|
34
|
+
kmsKey: cmk,
|
|
35
|
+
documentCapacityUnits: 1, // 40K documents
|
|
36
|
+
queryCapacityUnits: 1, // 0.2 QPS
|
|
37
|
+
});
|
|
38
|
+
|
|
39
|
+
// or import an existing one
|
|
40
|
+
const index = kendra.KendraGenAiIndex.fromAttrs(this, 'myindex', {
|
|
41
|
+
indexId: 'myindex',
|
|
42
|
+
role: myRole
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
new bedrock.KendraKnowledgeBase(this, 'kb', {
|
|
46
|
+
name: 'kendra-kb-cdk',
|
|
47
|
+
kendraIndex: index,
|
|
48
|
+
});
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Python
|
|
52
|
+
|
|
53
|
+
```py
|
|
54
|
+
from aws_cdk import aws_kms as kms
|
|
55
|
+
from cdklabs.generative_ai_cdk_constructs import bedrock, kendra
|
|
56
|
+
|
|
57
|
+
# Create a KMS key
|
|
58
|
+
cmk = kms.Key(stack, 'cmk')
|
|
59
|
+
|
|
60
|
+
# Create a new Kendra index
|
|
61
|
+
index = kendra.KendraGenAiIndex(self, 'index',
|
|
62
|
+
name='kendra-index-cdk',
|
|
63
|
+
kms_key=cmk,
|
|
64
|
+
document_capacity_units=1, # 40K documents
|
|
65
|
+
query_capacity_units=1 # 0.2 QPS
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
# Or import an existing index
|
|
69
|
+
index = kendra.KendraGenAiIndex.from_attrs(self, 'myindex',
|
|
70
|
+
index_id='myindex',
|
|
71
|
+
role=my_role
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
# Create a Kendra Knowledge Base
|
|
75
|
+
kb = bedrock.KendraKnowledgeBase(self, 'kb',
|
|
76
|
+
name='kendra-kb-cdk',
|
|
77
|
+
kendra_index=index
|
|
78
|
+
)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
[View full documentation](https://github.com/awslabs/generative-ai-cdk-constructs/blob/main/src/cdk-lib/bedrock/README.md)
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# Amazon Bedrock Knowledge Bases
|
|
2
|
+
|
|
3
|
+
Amazon Bedrock Knowledge Bases enable you to provide foundation models and agents with contextual information from your company's private data sources. This enhances the relevance, accuracy, and customization of their responses.
|
|
4
|
+
|
|
5
|
+
## Table of Contents
|
|
6
|
+
|
|
7
|
+
- [Amazon Bedrock Knowledge Bases](#amazon-bedrock-knowledge-bases)
|
|
8
|
+
- [Table of Contents](#table-of-contents)
|
|
9
|
+
- [Key Concepts](#key-concepts)
|
|
10
|
+
- [Knowledge Base Types](#knowledge-base-types)
|
|
11
|
+
- [Knowledge Base Components](#knowledge-base-components)
|
|
12
|
+
- [When to Use Knowledge Bases](#when-to-use-knowledge-bases)
|
|
13
|
+
- [Related Resources](#related-resources)
|
|
14
|
+
- [Quick Start Example](#quick-start-example)
|
|
15
|
+
- [Next Steps](#next-steps)
|
|
16
|
+
|
|
17
|
+
## Key Concepts
|
|
18
|
+
|
|
19
|
+
- **Knowledge Base**: A repository of information that foundation models can access to provide context-aware responses
|
|
20
|
+
- **Vector Store**: A specialized database that stores and retrieves text as vector embeddings
|
|
21
|
+
- **Chunking**: The process of breaking documents into smaller pieces for efficient storage and retrieval
|
|
22
|
+
- **Data Source**: The origin of information ingested into a knowledge base (S3, web crawler, etc.)
|
|
23
|
+
- **Embeddings Model**: A model that converts text into vector representations
|
|
24
|
+
|
|
25
|
+
## Knowledge Base Types
|
|
26
|
+
|
|
27
|
+
- **Vector Knowledge Base**: Uses vector embeddings to find semantically similar content (most common)
|
|
28
|
+
- **Kendra Knowledge Base**: Leverages Amazon Kendra's semantic search capabilities
|
|
29
|
+
- **Structured Data Retrieval**: Enables querying structured data sources (coming soon)
|
|
30
|
+
|
|
31
|
+
> **Note**: Vector Knowledge Base is the most common type and currently has the most comprehensive support in the GenAI CDK. See [Vector Knowledge Base Creation](vector/creation.md) to get started.
|
|
32
|
+
|
|
33
|
+
## Knowledge Base Components
|
|
34
|
+
|
|
35
|
+
```mermaid
|
|
36
|
+
graph TD
|
|
37
|
+
A[Knowledge Base] --> B[Vector Store]
|
|
38
|
+
A --> C[Data Sources]
|
|
39
|
+
A --> D[Embeddings Model]
|
|
40
|
+
B --> E[OpenSearch Serverless]
|
|
41
|
+
B --> F[Aurora PostgreSQL]
|
|
42
|
+
B --> G[Pinecone]
|
|
43
|
+
C --> H[S3]
|
|
44
|
+
C --> I[Web Crawler]
|
|
45
|
+
C --> J[Confluence]
|
|
46
|
+
C --> K[SharePoint]
|
|
47
|
+
C --> L[Salesforce]
|
|
48
|
+
A --> M[Chunking Strategy]
|
|
49
|
+
M --> N[Fixed Size]
|
|
50
|
+
M --> O[Hierarchical]
|
|
51
|
+
M --> P[Semantic]
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## When to Use Knowledge Bases
|
|
55
|
+
|
|
56
|
+
- **Domain-Specific Knowledge**: Provide specialized information not in the model's training data
|
|
57
|
+
- **Private Information**: Allow models to access your organization's proprietary information
|
|
58
|
+
- **Up-to-Date Information**: Supply models with the latest information beyond their training cutoff
|
|
59
|
+
- **RAG Applications**: Implement Retrieval Augmented Generation for more accurate responses
|
|
60
|
+
|
|
61
|
+
## Related Resources
|
|
62
|
+
|
|
63
|
+
- **Chunking Strategies**: `genai-cdk-constructs://bedrock/knowledgebases/chunking`
|
|
64
|
+
- **Data Sources**: `genai-cdk-constructs://bedrock/knowledgebases/datasources`
|
|
65
|
+
- **Kendra Integration**: `genai-cdk-constructs://bedrock/knowledgebases/kendra`
|
|
66
|
+
- **Parsing Strategies**: `genai-cdk-constructs://bedrock/knowledgebases/parsing`
|
|
67
|
+
- **Custom Transformations**: `genai-cdk-constructs://bedrock/knowledgebases/transformation`
|
|
68
|
+
- **Vector Stores**:
|
|
69
|
+
- `genai-cdk-constructs://bedrock/knowledgebases/vector/creation`
|
|
70
|
+
- `genai-cdk-constructs://bedrock/knowledgebases/vector/opensearch`
|
|
71
|
+
- `genai-cdk-constructs://bedrock/knowledgebases/vector/aurora`
|
|
72
|
+
- `genai-cdk-constructs://bedrock/knowledgebases/vector/pinecone`
|
|
73
|
+
|
|
74
|
+
## Quick Start Example
|
|
75
|
+
|
|
76
|
+
```typescript
|
|
77
|
+
import { bedrock } from '@cdklabs/generative-ai-cdk-constructs';
|
|
78
|
+
import * as s3 from 'aws-cdk-lib/aws-s3';
|
|
79
|
+
|
|
80
|
+
// Create a vector knowledge base
|
|
81
|
+
const kb = new bedrock.VectorKnowledgeBase(this, 'KnowledgeBase', {
|
|
82
|
+
// Use TITAN_EMBED_TEXT_V2 with appropriate dimension size based on your needs:
|
|
83
|
+
// - 256: Fastest, lowest storage requirements, good for simple use cases
|
|
84
|
+
// - 512: Balanced performance and accuracy, recommended for most use cases
|
|
85
|
+
// - 1024: Highest accuracy, best for complex semantic relationships
|
|
86
|
+
embeddingsModel: bedrock.BedrockFoundationModel.TITAN_EMBED_TEXT_V2_512,
|
|
87
|
+
instruction: 'Use this knowledge base to answer questions about our company policies.',
|
|
88
|
+
});
|
|
89
|
+
|
|
90
|
+
// Create a secure S3 bucket for data
|
|
91
|
+
const dataBucket = new s3.Bucket(this, 'DataBucket');
|
|
92
|
+
|
|
93
|
+
// Add an S3 data source with chunking strategy
|
|
94
|
+
new bedrock.S3DataSource(this, 'DataSource', {
|
|
95
|
+
bucket: dataBucket,
|
|
96
|
+
knowledgeBase: kb,
|
|
97
|
+
dataSourceName: 'CompanyPolicies',
|
|
98
|
+
chunkingStrategy: bedrock.ChunkingStrategy.fixedSize({
|
|
99
|
+
maxTokens: 300,
|
|
100
|
+
overlapPercentage: 20,
|
|
101
|
+
}),
|
|
102
|
+
});
|
|
103
|
+
|
|
104
|
+
// Note: When choosing an embedding model version:
|
|
105
|
+
// - TITAN_EMBED_TEXT_V2_256: Lower dimension, faster, smaller storage footprint
|
|
106
|
+
// - TITAN_EMBED_TEXT_V2_512: Balanced performance and accuracy (recommended)
|
|
107
|
+
// - TITAN_EMBED_TEXT_V2_1024: Higher dimension, more accurate but more expensive
|
|
108
|
+
|
|
109
|
+
```

## Next Steps
|
|
110
|
+
|
|
111
|
+
For more detailed information:
|
|
112
|
+
|
|
113
|
+
- [Vector Knowledge Base Creation](vector/creation.md) - Detailed properties and configuration options
|
|
114
|
+
- [Vector Stores](vector/opensearch.md) - Different vector store options (OpenSearch, Aurora, Pinecone)
|
|
115
|
+
- [Chunking Strategies](chunking.md) - Options for chunking your data
|
|
116
|
+
- [Data Sources](datasources.md) - Different data source types and configuration
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Vector Knowledge Base - Parsing Strategy
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
A parsing strategy in Amazon Bedrock is a configuration that determines how the service
|
|
6
|
+
processes and interprets the contents of a document. It involves converting the document's
|
|
7
|
+
contents into text and splitting it into smaller chunks for analysis. Amazon Bedrock offers
|
|
8
|
+
two parsing strategies:
|
|
9
|
+
|
|
10
|
+
### Default Parsing Strategy
|
|
11
|
+
|
|
12
|
+
This strategy converts the document's contents into text
|
|
13
|
+
and splits it into chunks using a predefined approach. It is suitable for most use cases
|
|
14
|
+
but may not be optimal for specific document types or requirements.
|
|
15
|
+
|
|
16
|
+
### Foundation Model Parsing Strategy
|
|
17
|
+
|
|
18
|
+
This strategy uses a foundation model to describe
|
|
19
|
+
the contents of the document. It is particularly useful for improved processing of PDF files
|
|
20
|
+
with tables and images. To use this strategy, set the `parsingStrategy` in a data source as below.
|
|
21
|
+
|
|
22
|
+
#### TypeScript
|
|
23
|
+
|
|
24
|
+
```ts
|
|
25
|
+
bedrock.ParsingStategy.foundationModel({
|
|
26
|
+
model: BedrockFoundationModel.ANTHROPIC_CLAUDE_SONNET_V1_0,
|
|
27
|
+
});
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
#### Python
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
bedrock.ParsingStategy.foundation_model(
|
|
34
|
+
parsing_model=BedrockFoundationModel.ANTHROPIC_CLAUDE_SONNET_V1_0
|
|
35
|
+
)
|
|
36
|
+
```
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Knowledge Base - Custom Transformation
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
Custom Transformation in Amazon Bedrock is a feature that allows you to create and apply
|
|
6
|
+
custom processing steps to documents moving through a data source ingestion pipeline.
|
|
7
|
+
|
|
8
|
+
Custom Transformation uses AWS Lambda functions to process documents, enabling you to
|
|
9
|
+
perform custom operations such as data extraction, normalization, or enrichment. To
|
|
10
|
+
create a custom transformation, set the `customTransformation` in a data source as below.
|
|
11
|
+
|
|
12
|
+
## Example
|
|
13
|
+
|
|
14
|
+
### TypeScript
|
|
15
|
+
|
|
16
|
+
```ts
|
|
17
|
+
CustomTransformation.lambda({
|
|
18
|
+
lambdaFunction: lambdaFunction,
|
|
19
|
+
s3BucketUri: `s3://${bucket.bucketName}/chunk-processor/`,
|
|
20
|
+
}),
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
### Python
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
CustomTransformation.lambda_(
|
|
27
|
+
lambda_function= function,
|
|
28
|
+
s3_bucket_uri= f's3://{docBucket.bucket_name}/chunk-processor/'
|
|
29
|
+
)
|
|
30
|
+
```
|