agent-starter-pack 0.1.7__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {agent_starter_pack-0.1.7.dist-info → agent_starter_pack-0.2.1.dist-info}/METADATA +7 -6
- {agent_starter_pack-0.1.7.dist-info → agent_starter_pack-0.2.1.dist-info}/RECORD +77 -77
- agents/{agentic_rag_vertexai_search → agentic_rag}/README.md +3 -3
- agents/{agentic_rag_vertexai_search → agentic_rag}/app/agent.py +22 -6
- agents/agentic_rag/app/retrievers.py +132 -0
- agents/{agentic_rag_vertexai_search → agentic_rag}/notebooks/evaluating_langgraph_agent.ipynb +3 -3
- agents/{agentic_rag_vertexai_search → agentic_rag}/template/.templateconfig.yaml +3 -5
- agents/crewai_coding_crew/notebooks/evaluating_crewai_agent.ipynb +4 -4
- agents/crewai_coding_crew/notebooks/evaluating_langgraph_agent.ipynb +3 -3
- agents/langgraph_base_react/notebooks/evaluating_langgraph_agent.ipynb +3 -3
- agents/{multimodal_live_api → live_api}/README.md +7 -0
- agents/{multimodal_live_api → live_api}/app/agent.py +3 -11
- agents/{multimodal_live_api → live_api}/app/server.py +3 -2
- agents/{multimodal_live_api → live_api}/template/.templateconfig.yaml +2 -2
- src/base_template/Makefile +12 -7
- src/base_template/README.md +71 -71
- src/base_template/app/utils/tracing.py +3 -1
- src/base_template/app/utils/typing.py +1 -0
- src/base_template/deployment/cd/deploy-to-prod.yaml +10 -4
- src/base_template/deployment/cd/staging.yaml +11 -10
- src/base_template/deployment/ci/pr_checks.yaml +1 -1
- src/base_template/deployment/terraform/apis.tf +6 -0
- src/base_template/deployment/terraform/build_triggers.tf +34 -21
- src/base_template/deployment/terraform/dev/iam.tf +13 -6
- src/base_template/deployment/terraform/dev/log_sinks.tf +25 -28
- src/base_template/deployment/terraform/dev/providers.tf +1 -0
- src/base_template/deployment/terraform/dev/storage.tf +69 -11
- src/base_template/deployment/terraform/dev/variables.tf +50 -53
- src/base_template/deployment/terraform/dev/vars/env.tfvars +13 -11
- src/base_template/deployment/terraform/iam.tf +3 -3
- src/base_template/deployment/terraform/log_sinks.tf +24 -26
- src/base_template/deployment/terraform/providers.tf +2 -0
- src/base_template/deployment/terraform/service_accounts.tf +7 -7
- src/base_template/deployment/terraform/storage.tf +123 -11
- src/base_template/deployment/terraform/variables.tf +49 -70
- src/base_template/deployment/terraform/vars/env.tfvars +12 -17
- src/base_template/pyproject.toml +4 -3
- src/cli/commands/create.py +79 -19
- src/cli/commands/setup_cicd.py +91 -22
- src/cli/main.py +3 -1
- src/cli/utils/__init__.py +9 -2
- src/cli/utils/cicd.py +12 -0
- src/cli/utils/datastores.py +32 -0
- src/cli/utils/gcp.py +4 -6
- src/cli/utils/template.py +127 -45
- src/cli/utils/version.py +87 -0
- src/data_ingestion/README.md +24 -19
- src/data_ingestion/data_ingestion_pipeline/components/ingest_data.py +135 -2
- src/data_ingestion/data_ingestion_pipeline/components/process_data.py +276 -2
- src/data_ingestion/data_ingestion_pipeline/pipeline.py +28 -5
- src/data_ingestion/data_ingestion_pipeline/submit_pipeline.py +49 -14
- src/data_ingestion/pyproject.toml +1 -0
- src/deployment_targets/agent_engine/app/agent_engine_app.py +3 -1
- src/deployment_targets/cloud_run/tests/unit/test_server.py +15 -33
- src/frontends/live_api_react/frontend/package-lock.json +208 -168
- src/frontends/live_api_react/frontend/package.json +1 -1
- src/resources/containers/data_processing/Dockerfile +3 -1
- src/resources/locks/{uv-agentic_rag_vertexai_search-agent_engine.lock → uv-agentic_rag-agent_engine.lock} +747 -694
- src/resources/locks/{uv-agentic_rag_vertexai_search-cloud_run.lock → uv-agentic_rag-cloud_run.lock} +944 -806
- src/resources/locks/uv-crewai_coding_crew-agent_engine.lock +651 -694
- src/resources/locks/uv-crewai_coding_crew-cloud_run.lock +813 -789
- src/resources/locks/uv-langgraph_base_react-agent_engine.lock +666 -686
- src/resources/locks/uv-langgraph_base_react-cloud_run.lock +848 -798
- src/resources/locks/{uv-multimodal_live_api-cloud_run.lock → uv-live_api-cloud_run.lock} +856 -791
- src/resources/setup_cicd/cicd_variables.tf +5 -0
- src/resources/setup_cicd/github.tf +4 -2
- src/utils/watch_and_rebuild.py +14 -0
- agents/agentic_rag_vertexai_search/app/retrievers.py +0 -79
- src/deployment_targets/cloud_run/deployment/terraform/artifact_registry.tf +0 -22
- src/deployment_targets/cloud_run/deployment/terraform/dev/service_accounts.tf +0 -20
- {agent_starter_pack-0.1.7.dist-info → agent_starter_pack-0.2.1.dist-info}/WHEEL +0 -0
- {agent_starter_pack-0.1.7.dist-info → agent_starter_pack-0.2.1.dist-info}/entry_points.txt +0 -0
- {agent_starter_pack-0.1.7.dist-info → agent_starter_pack-0.2.1.dist-info}/licenses/LICENSE +0 -0
- /agents/{agentic_rag_vertexai_search → agentic_rag}/app/templates.py +0 -0
- /agents/{agentic_rag_vertexai_search → agentic_rag}/tests/integration/test_agent.py +0 -0
- /agents/{multimodal_live_api → live_api}/app/templates.py +0 -0
- /agents/{multimodal_live_api → live_api}/app/vector_store.py +0 -0
- /agents/{multimodal_live_api → live_api}/tests/integration/test_server_e2e.py +0 -0
- /agents/{multimodal_live_api → live_api}/tests/load_test/load_test.py +0 -0
- /agents/{multimodal_live_api → live_api}/tests/unit/test_server.py +0 -0
src/cli/utils/version.py
ADDED

@@ -0,0 +1,87 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Version checking utilities for the CLI."""
+
+import logging
+from importlib.metadata import PackageNotFoundError, version
+
+import requests
+from packaging import version as pkg_version
+from rich.console import Console
+
+console = Console()
+
+PACKAGE_NAME = "agent-starter-pack"
+
+
+def get_current_version() -> str:
+    """Get the current installed version of the package."""
+    try:
+        return version(PACKAGE_NAME)
+    except PackageNotFoundError:
+        # For development environments where package isn't installed
+        return "0.0.0"  # Default if version can't be determined
+
+
+def get_latest_version() -> str:
+    """Get the latest version available on PyPI."""
+    try:
+        response = requests.get(f"https://pypi.org/pypi/{PACKAGE_NAME}/json", timeout=2)
+        if response.status_code == 200:
+            return response.json()["info"]["version"]
+        return "0.0.0"
+    except Exception:
+        return "0.0.0"  # Default if PyPI can't be reached
+
+
+def check_for_updates() -> tuple[bool, str, str]:
+    """Check if a newer version of the package is available.
+
+    Returns:
+        Tuple of (needs_update, current_version, latest_version)
+    """
+    current = get_current_version()
+    latest = get_latest_version()
+
+    needs_update = pkg_version.parse(latest) > pkg_version.parse(current)
+
+    return needs_update, current, latest
+
+
+def display_update_message() -> None:
+    """Check for updates and display a message if an update is available."""
+    try:
+        needs_update, current, latest = check_for_updates()
+
+        if needs_update:
+            console.print(
+                f"\n[yellow]⚠️ Update available: {current} → {latest}[/]",
+                highlight=False,
+            )
+            console.print(
+                f"[yellow]Run `pip install --upgrade {PACKAGE_NAME}` to update.",
+                highlight=False,
+            )
+            console.print(
+                f"[yellow]Or, if you used pipx: `pipx upgrade {PACKAGE_NAME}`",
+                highlight=False,
+            )
+            console.print(
+                f"[yellow]Or, if you used uv: `uv pip install --upgrade {PACKAGE_NAME}`",
+                highlight=False,
+            )
+    except Exception as e:
+        # Don't let version checking errors affect the CLI
+        logging.debug(f"Error checking for updates: {e}")
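The new module fails soft at every step (`0.0.0` fallbacks, a 2-second HTTP timeout, and a catch-all around the display), so a CLI can call it unconditionally at startup. A minimal sketch of that wiring; the entry-point function and import path are illustrative, not taken from this diff:

```python
# Hypothetical CLI entry point exercising the new version utilities.
# display_update_message() swallows its own errors, so calling it first
# cannot break the actual command.
from agent_starter_pack.cli.utils.version import (  # import path is illustrative
    check_for_updates,
    display_update_message,
)


def main() -> None:
    display_update_message()  # prints upgrade hints only if PyPI has a newer release
    needs_update, current, latest = check_for_updates()
    print(f"installed={current}, latest={latest}, update needed={needs_update}")


if __name__ == "__main__":
    main()
```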
src/data_ingestion/README.md
CHANGED

@@ -1,8 +1,8 @@
 # Data Ingestion Pipeline
 
-This pipeline automates the ingestion of data into Vertex AI Search, streamlining the process of building Retrieval Augmented Generation (RAG) applications.
+This pipeline automates the ingestion of data into Vertex AI{%- if cookiecutter.datastore_type == "vertex_ai_vector_search" %} Vector{%- endif %} Search, streamlining the process of building Retrieval Augmented Generation (RAG) applications.
 
-It orchestrates the complete workflow: loading data, chunking it into manageable segments, generating embeddings using Vertex AI Embeddings, and importing the processed data into your Vertex AI Search datastore.
+It orchestrates the complete workflow: loading data, chunking it into manageable segments, generating embeddings using Vertex AI Embeddings, and importing the processed data into your Vertex AI{%- if cookiecutter.datastore_type == "vertex_ai_vector_search" %} Vector{%- endif %} Search datastore.
 
 You can trigger the pipeline for an initial data load or schedule it to run periodically, ensuring your search index remains current. Vertex AI Pipelines provides the orchestration and monitoring capabilities for this process.
 
@@ -37,7 +37,7 @@ uv sync --frozen
 **c. Execute the Pipeline:**
 
 Run the following command to execute the data ingestion pipeline. Replace the placeholder values with your actual project details.
-
+{%- if cookiecutter.datastore_type == "vertex_ai_search" %}
 ```bash
 PROJECT_ID="YOUR_PROJECT_ID"
 REGION="us-central1"
@@ -47,22 +47,26 @@ uv run data_ingestion_pipeline/submit_pipeline.py \
   --region=$REGION \
   --data-store-region=$DATA_STORE_REGION \
   --data-store-id="sample-datastore" \
-  --service-account="
-  --pipeline-root="gs://$PROJECT_ID-
+  --service-account="{{cookiecutter.project_name}}-rag@$PROJECT_ID.iam.gserviceaccount.com" \
+  --pipeline-root="gs://$PROJECT_ID-{{cookiecutter.project_name}}-rag" \
   --pipeline-name="data-ingestion-pipeline"
 ```
-
-
-
-
-
-
-
-
-
-
-
-
+{%- elif cookiecutter.datastore_type == "vertex_ai_vector_search" %}
+```bash
+PROJECT_ID="YOUR_PROJECT_ID"
+REGION="us-central1"
+VECTOR_SEARCH_INDEX="YOUR_VECTOR_SEARCH_INDEX"
+VECTOR_SEARCH_INDEX_ENDPOINT="YOUR_VECTOR_SEARCH_INDEX_ENDPOINT"
+uv run data_ingestion_pipeline/submit_pipeline.py \
+  --project-id=$PROJECT_ID \
+  --region=$REGION \
+  --vector-search-index=$VECTOR_SEARCH_INDEX \
+  --vector-search-index-endpoint=$VECTOR_SEARCH_INDEX_ENDPOINT \
+  --service-account="{{cookiecutter.project_name}}-rag@$PROJECT_ID.iam.gserviceaccount.com" \
+  --pipeline-root="gs://$PROJECT_ID-{{cookiecutter.project_name}}-rag" \
+  --pipeline-name="data-ingestion-pipeline"
+```
+{%- endif %}
 
 **d. Pipeline Scheduling and Execution:**
 
@@ -74,6 +78,7 @@ The pipeline's configuration and execution status will be printed to the console
 
 ## Testing Your RAG Application
 
-Once the data ingestion pipeline completes successfully, you can test your RAG application with Vertex AI Search.
-
+Once the data ingestion pipeline completes successfully, you can test your RAG application with Vertex AI{%- if cookiecutter.datastore_type == "vertex_ai_vector_search" %} Vector{%- endif %} Search.
+{%- if cookiecutter.datastore_type == "vertex_ai_search" %}
 > **Troubleshooting:** If you encounter the error `"google.api_core.exceptions.InvalidArgument: 400 The embedding field path: embedding not found in schema"` after the initial data ingestion, wait a few minutes and try again. This delay allows Vertex AI Search to fully index the ingested data.
+{%- endif %}
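The `{%- if cookiecutter.datastore_type %}` markers added above are cookiecutter (Jinja2) conditionals resolved when a project is generated, which is how one README source now serves both datastore backends. A minimal sketch of that rendering mechanic, using jinja2 directly with an illustrative template string (cookiecutter itself supplies the real context):

```python
# Sketch of how the cookiecutter conditionals in the README resolve.
from jinja2 import Template

template = Template(
    "Vertex AI"
    "{%- if cookiecutter.datastore_type == 'vertex_ai_vector_search' %} Vector{%- endif %}"
    " Search"
)

for datastore_type in ("vertex_ai_search", "vertex_ai_vector_search"):
    context = {"cookiecutter": {"datastore_type": datastore_type}}
    print(datastore_type, "->", template.render(**context))
# vertex_ai_search -> Vertex AI Search
# vertex_ai_vector_search -> Vertex AI Vector Search
```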
src/data_ingestion/data_ingestion_pipeline/components/ingest_data.py
CHANGED

@@ -11,12 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# ruff: noqa
 
 from kfp.dsl import Dataset, Input, component
-
+{% if cookiecutter.datastore_type == "vertex_ai_search" %}
 
 @component(
-    base_image="us-docker.pkg.dev/production-ai-template/starter-pack/data_processing:0.
+    base_image="us-docker.pkg.dev/production-ai-template/starter-pack/data_processing:0.2"
 )
 def ingest_data(
     project_id: str,
@@ -37,6 +38,7 @@ def ingest_data(
     """
     import json
     import logging
+    import time
 
     from google.api_core.client_options import ClientOptions
     from google.cloud import discoveryengine
@@ -173,3 +175,134 @@ def ingest_data(
         input_files_uri=input_files.uri,
     )
     logging.info("Data import completed")
+    logging.info(
+        "Sleeping for 3 minutes to allow Vertex AI Search to properly index the data..."
+    )
+    time.sleep(180)  # Sleep for 180 seconds (3 minutes)
+    logging.info("Sleep completed. Data indexing should now be complete.")
+{% elif cookiecutter.datastore_type == "vertex_ai_vector_search" %}
+from google_cloud_pipeline_components.types.artifact_types import BQTable
+
+
+@component(
+    base_image="us-docker.pkg.dev/production-ai-template/starter-pack/data_processing:0.2"
+)
+def ingest_data(
+    project_id: str,
+    location: str,
+    vector_search_index: str,
+    vector_search_index_endpoint: str,
+    vector_search_data_bucket_name: str,
+    schedule_time: str,
+    ingestion_batch_size: int,
+    input_table: Input[BQTable],
+    is_incremental: bool = True,
+    look_back_days: int = 1,
+) -> None:
+    """Process and ingest documents into Vertex AI Vector Search.
+
+    Args:
+        project_id: Google Cloud project ID
+    """
+    import logging
+    from datetime import datetime, timedelta
+
+    import bigframes.pandas as bpd
+    from google.cloud import aiplatform
+    from langchain_google_vertexai import VectorSearchVectorStore
+    from langchain_google_vertexai import VertexAIEmbeddings
+
+    # Initialize logging
+    logging.basicConfig(level=logging.INFO)
+
+    # Initialize clients
+    logging.info("Initializing clients...")
+    bpd.options.bigquery.project = project_id
+    bpd.options.bigquery.location = location
+    logging.info("Clients initialized.")
+
+    # Set date range for data fetch
+    schedule_time_dt: datetime = datetime.fromisoformat(
+        schedule_time.replace("Z", "+00:00")
+    )
+    if schedule_time_dt.year == 1970:
+        logging.warning(
+            "Pipeline schedule not set. Setting schedule_time to current date."
+        )
+        schedule_time_dt = datetime.now()
+
+    # Note: The following line sets the schedule time 5 years back to allow sample data to be present.
+    # For your use case, please comment out the following line to use the actual schedule time.
+    schedule_time_dt = schedule_time_dt - timedelta(days=5 * 365)
+
+    START_DATE: datetime = schedule_time_dt - timedelta(
+        days=look_back_days
+    )  # Start date for data processing window
+    END_DATE: datetime = schedule_time_dt  # End date for data processing window
+
+    logging.info(f"Date range set: START_DATE={START_DATE}, END_DATE={END_DATE}")
+
+    dataset = input_table.metadata["datasetId"]
+    table = input_table.metadata["tableId"]
+
+    query = f"""
+    SELECT
+        question_id
+        , last_edit_date
+        , full_text_md
+        , text_chunk
+        , chunk_id
+        , embedding
+    FROM {project_id}.{dataset}.{table}
+    WHERE TRUE
+        {f'AND DATETIME(creation_timestamp) BETWEEN DATETIME("{START_DATE}") AND DATETIME("{END_DATE}")' if is_incremental else ""}
+    """
+    df = (
+        bpd.read_gbq(query)
+        .sort_values("last_edit_date", ascending=False)
+        .drop_duplicates("question_id")
+        .reset_index(drop=True)
+    )
+
+    aiplatform.init(
+        project=project_id,
+        location=location,
+        staging_bucket=vector_search_data_bucket_name,
+    )
+
+    embedding_model = VertexAIEmbeddings(model_name="text-embedding-005")
+    my_index = aiplatform.MatchingEngineIndex(vector_search_index)
+    my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint(
+        vector_search_index_endpoint
+    )
+    vector_store = VectorSearchVectorStore.from_components(
+        project_id=project_id,
+        region=location,
+        gcs_bucket_name=vector_search_data_bucket_name.replace("gs://", ""),
+        index_id=my_index.name,
+        endpoint_id=my_index_endpoint.name,
+        embedding=embedding_model,
+        stream_update=True,
+    )
+
+    for batch_num, start in enumerate(range(0, len(df), ingestion_batch_size)):
+        ids = (
+            df.iloc[start : start + ingestion_batch_size]
+            .question_id.astype(str)
+            .tolist()
+        )
+        texts = df.iloc[start : start + ingestion_batch_size].text_chunk.tolist()
+        embeddings = df.iloc[start : start + ingestion_batch_size].embedding.tolist()
+        metadatas = (
+            df.iloc[start : start + ingestion_batch_size]
+            .drop(columns=["embedding", "last_edit_date"])
+            .to_dict(orient="records")
+        )
+        vector_store.add_texts_with_embeddings(
+            ids=ids,
+            texts=texts,
+            embeddings=embeddings,
+            metadatas=metadatas,
+            is_complete_overwrite=True,
+        )
+{% endif %}
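The new vector-search branch of `ingest_data` upserts the dataframe in `ingestion_batch_size` strides. A dependency-free sketch of that slicing pattern, with toy rows standing in for the bigframes DataFrame and the vector-store call:

```python
# Toy illustration of the batched-upsert loop in ingest_data().
rows = [{"id": str(i), "chunk": f"text {i}"} for i in range(10)]
ingestion_batch_size = 4

for batch_num, start in enumerate(range(0, len(rows), ingestion_batch_size)):
    batch = rows[start : start + ingestion_batch_size]
    ids = [r["id"] for r in batch]
    # the real component calls vector_store.add_texts_with_embeddings(ids=ids, ...)
    print(f"batch {batch_num}: {ids}")
```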
src/data_ingestion/data_ingestion_pipeline/components/process_data.py
CHANGED

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# ruff: noqa
 
 """
 This component is derived from the notebook:
@@ -21,9 +22,9 @@ It leverages BigQuery for data processing. We also suggest looking at remote fun
 
 from kfp.dsl import Dataset, Output, component
 
-
+{% if cookiecutter.datastore_type == "vertex_ai_search" %}
 @component(
-    base_image="us-docker.pkg.dev/production-ai-template/starter-pack/data_processing:0.
+    base_image="us-docker.pkg.dev/production-ai-template/starter-pack/data_processing:0.2"
 )
 def process_data(
     project_id: str,
@@ -319,3 +320,276 @@ def process_data(
     )
     extract_job.result()
     logging.info("Exported to JSONL.")
+{% elif cookiecutter.datastore_type == "vertex_ai_vector_search" %}
+from google_cloud_pipeline_components.types.artifact_types import BQTable
+
+
+@component(
+    base_image="us-docker.pkg.dev/production-ai-template/starter-pack/data_processing:0.2",
+)
+def process_data(
+    project_id: str,
+    schedule_time: str,
+    output_table: Output[BQTable],
+    is_incremental: bool = True,
+    look_back_days: int = 1,
+    chunk_size: int = 1500,
+    chunk_overlap: int = 20,
+    destination_dataset: str = "stackoverflow_data",
+    destination_table: str = "incremental_questions_embeddings",
+    deduped_table: str = "questions_embeddings",
+    location: str = "us-central1",
+) -> None:
+    """Process StackOverflow questions and answers by:
+    1. Fetching data from BigQuery
+    2. Converting HTML to markdown
+    3. Splitting text into chunks
+    4. Generating embeddings
+    5. Storing results in BigQuery
+    6. Exporting to JSONL
+
+    Args:
+        output_files: Output dataset path
+        is_incremental: Whether to process only recent data
+        look_back_days: Number of days to look back for incremental processing
+        chunk_size: Size of text chunks
+        chunk_overlap: Overlap between chunks
+        destination_dataset: BigQuery dataset for storing results
+        destination_table: Table for storing incremental results
+        deduped_table: Table for storing deduplicated results
+        location: BigQuery location
+    """
+    import logging
+    from datetime import datetime, timedelta
+
+    import backoff
+    import bigframes.ml.llm as llm
+    import bigframes.pandas as bpd
+    import google.api_core.exceptions
+    import swifter
+    from google.cloud import bigquery
+    from langchain.text_splitter import RecursiveCharacterTextSplitter
+    from markdownify import markdownify
+
+    # Initialize logging
+    logging.basicConfig(level=logging.INFO)
+    logging.info(f"Using {swifter} for apply operations.")
+
+    # Initialize clients
+    logging.info("Initializing clients...")
+    bq_client = bigquery.Client(project=project_id, location=location)
+    bpd.options.bigquery.project = project_id
+    bpd.options.bigquery.location = location
+    logging.info("Clients initialized.")
+
+    # Set date range for data fetch
+    schedule_time_dt: datetime = datetime.fromisoformat(
+        schedule_time.replace("Z", "+00:00")
+    )
+    if schedule_time_dt.year == 1970:
+        logging.warning(
+            "Pipeline schedule not set. Setting schedule_time to current date."
+        )
+        schedule_time_dt = datetime.now()
+
+    # Note: The following line sets the schedule time 5 years back to allow sample data to be present.
+    # For your use case, please comment out the following line to use the actual schedule time.
+    schedule_time_dt = schedule_time_dt - timedelta(days=5 * 365)
+
+    START_DATE: datetime = schedule_time_dt - timedelta(
+        days=look_back_days
+    )  # Start date for data processing window
+    END_DATE: datetime = schedule_time_dt  # End date for data processing window
+
+    logging.info(f"Date range set: START_DATE={START_DATE}, END_DATE={END_DATE}")
+
+    def fetch_stackoverflow_data(
+        dataset_suffix: str, start_date: str, end_date: str
+    ) -> bpd.DataFrame:
+        """Fetch StackOverflow data from BigQuery."""
+        query = f"""
+        SELECT
+            creation_date,
+            last_edit_date,
+            question_id,
+            question_title,
+            question_body AS question_text,
+            answers
+        FROM `production-ai-template.stackoverflow_qa_{dataset_suffix}.stackoverflow_python_questions_and_answers`
+        WHERE TRUE
+            {f'AND TIMESTAMP_TRUNC(creation_date, DAY) BETWEEN TIMESTAMP("{start_date}") AND TIMESTAMP("{end_date}")' if is_incremental else ""}
+        """
+        logging.info("Fetching StackOverflow data from BigQuery...")
+        return bpd.read_gbq(query)
+
+    def convert_html_to_markdown(html: str) -> str:
+        """Convert HTML into Markdown for easier parsing and rendering after LLM response."""
+        return markdownify(html).strip()
+
+    def create_answers_markdown(answers: list) -> str:
+        """Convert each answer's HTML to markdown and concatenate into a single markdown text."""
+        answers_md = ""
+        for index, answer_record in enumerate(answers):
+            answers_md += (
+                f"\n\n## Answer {index + 1}:\n"  # Answer number is H2 heading size
+            )
+            answers_md += convert_html_to_markdown(answer_record["body"])
+        return answers_md
+
+    def create_table_if_not_exist(
+        df: bpd.DataFrame,
+        project_id: str,
+        dataset_id: str,
+        table_id: str,
+        partition_column: str,
+        location: str = location,
+    ) -> None:
+        """Create BigQuery table with time partitioning if it doesn't exist."""
+        table_schema = bq_client.get_table(df.head(0).to_gbq()).schema
+        table = bigquery.Table(
+            f"{project_id}.{dataset_id}.{table_id}", schema=table_schema
+        )
+        table.time_partitioning = bigquery.TimePartitioning(
+            type_=bigquery.TimePartitioningType.DAY, field=partition_column
+        )
+
+        dataset = bigquery.Dataset(f"{project_id}.{dataset_id}")
+        dataset.location = location
+        bq_client.create_dataset(dataset, exists_ok=True)
+        bq_client.create_table(table=table, exists_ok=True)
+
+    # Fetch and preprocess data
+    logging.info("Fetching and preprocessing data...")
+    df = fetch_stackoverflow_data(
+        start_date=START_DATE.strftime("%Y-%m-%d"),
+        end_date=END_DATE.strftime("%Y-%m-%d"),
+        dataset_suffix=location.lower().replace("-", "_"),
+    )
+    df = (
+        df.sort_values("last_edit_date", ascending=False)
+        .drop_duplicates("question_id")
+        .reset_index(drop=True)
+    )
+    logging.info("Data fetched and preprocessed.")
+
+    # Convert content to markdown
+    logging.info("Converting content to markdown...")
+
+    # Create markdown fields efficiently
+    df["question_title_md"] = (
+        "# " + df["question_title"] + "\n"
+    )  # Title is H1 heading size
+    df["question_text_md"] = (
+        df["question_text"].to_pandas().swifter.apply(convert_html_to_markdown) + "\n"
+    )
+    df["answers_md"] = df["answers"].to_pandas().swifter.apply(create_answers_markdown)
+
+    # Create a column containing the whole markdown text
+    df["full_text_md"] = (
+        df["question_title_md"] + df["question_text_md"] + df["answers_md"]
+    )
+    logging.info("Content converted to markdown.")
+
+    # Keep only necessary columns
+    df = df[["last_edit_date", "question_id", "question_text", "full_text_md"]]
+
+    # Split text into chunks
+    logging.info("Splitting text into chunks...")
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        length_function=len,
+    )
+
+    df["text_chunk"] = (
+        df["full_text_md"]
+        .to_pandas()
+        .astype(object)
+        .swifter.apply(text_splitter.split_text)
+    )
+    logging.info("Text split into chunks.")
+
+    # Create chunk IDs and explode chunks into rows
+    logging.info("Creating chunk IDs and exploding chunks into rows...")
+    chunk_ids = [
+        str(idx) for text_chunk in df["text_chunk"] for idx in range(len(text_chunk))
+    ]
+    df = df.explode("text_chunk").reset_index(drop=True)
+    df["chunk_id"] = df["question_id"].astype("string") + "__" + chunk_ids
+    logging.info("Chunk IDs created and chunks exploded.")
+
+    # Generate embeddings
+    logging.info("Generating embeddings...")
+
+    # The first invocation in a new project might fail due to permission propagation.
+    @backoff.on_exception(
+        backoff.expo, google.api_core.exceptions.InvalidArgument, max_tries=10
+    )
+    def create_embedder() -> llm.TextEmbeddingGenerator:
+        return llm.TextEmbeddingGenerator(model_name="text-embedding-005")
+
+    embedder = create_embedder()
+
+    embeddings_df = embedder.predict(df["text_chunk"])
+    logging.info("Embeddings generated.")
+
+    df = df.assign(
+        embedding=embeddings_df["ml_generate_embedding_result"],
+        embedding_statistics=embeddings_df["ml_generate_embedding_statistics"],
+        embedding_status=embeddings_df["ml_generate_embedding_status"],
+        creation_timestamp=datetime.now(),
+    )
+
+    # Store results in BigQuery
+    PARTITION_DATE_COLUMN = "creation_timestamp"
+
+    # Create and populate incremental table
+    logging.info("Creating and populating incremental table...")
+    create_table_if_not_exist(
+        df=df,
+        project_id=project_id,
+        dataset_id=destination_dataset,
+        table_id=destination_table,
+        partition_column=PARTITION_DATE_COLUMN,
+    )
+
+    if_exists_mode = "append" if is_incremental else "replace"
+    df.to_gbq(
+        destination_table=f"{destination_dataset}.{destination_table}",
+        if_exists=if_exists_mode,
+    )
+    logging.info("Incremental table created and populated.")
+
+    # Create deduplicated table
+    logging.info("Creating deduplicated table...")
+    df_questions = bpd.read_gbq(
+        f"{destination_dataset}.{destination_table}", use_cache=False
+    )
+    max_date_df = (
+        df_questions.groupby("question_id")["creation_timestamp"].max().reset_index()
+    )
+    df_questions_dedup = max_date_df.merge(
+        df_questions, how="inner", on=["question_id", "creation_timestamp"]
+    )
+
+    create_table_if_not_exist(
+        df=df_questions_dedup,
+        project_id=project_id,
+        dataset_id=destination_dataset,
+        table_id=deduped_table,
+        partition_column=PARTITION_DATE_COLUMN,
+    )
+
+    df_questions_dedup.to_gbq(
+        destination_table=f"{destination_dataset}.{deduped_table}",
+        if_exists="replace",
+    )
+    logging.info("Deduplicated table created and populated.")
+    # Set artifact metadata (important!)
+    output_table.uri = (
+        f"bq://{project_id}.{destination_dataset}.{deduped_table}"  # Full BQ URI
+    )
+    output_table.metadata["projectId"] = project_id
+    output_table.metadata["datasetId"] = destination_dataset
+    output_table.metadata["tableId"] = deduped_table
+{% endif %}
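The vector-search branch of `process_data` derives one row per chunk by exploding the `text_chunk` lists and suffixing a per-question index onto `question_id`. A pandas-only sketch of that explode-and-suffix pattern on toy data:

```python
# Chunk IDs come out as "<question_id>__<chunk index>", e.g. "42__0", "42__1".
import pandas as pd

df = pd.DataFrame(
    {"question_id": [42, 7], "text_chunk": [["intro", "details"], ["only chunk"]]}
)
chunk_ids = [str(i) for chunks in df["text_chunk"] for i in range(len(chunks))]
df = df.explode("text_chunk").reset_index(drop=True)
df["chunk_id"] = df["question_id"].astype("string") + "__" + chunk_ids
print(df[["chunk_id", "text_chunk"]])
```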
src/data_ingestion/data_ingestion_pipeline/pipeline.py
CHANGED

@@ -21,15 +21,22 @@ from kfp import dsl
 def pipeline(
     project_id: str,
     location: str,
-    data_store_region: str,
-    data_store_id: str,
     is_incremental: bool = True,
     look_back_days: int = 1,
     chunk_size: int = 1500,
     chunk_overlap: int = 20,
-    destination_dataset: str = "stackoverflow_data",
     destination_table: str = "incremental_questions_embeddings",
     deduped_table: str = "questions_embeddings",
+    destination_dataset: str = "{{cookiecutter.project_name | replace('-', '_')}}_stackoverflow_data",
+    {%- if cookiecutter.datastore_type == "vertex_ai_search" %}
+    data_store_region: str = "",
+    data_store_id: str = "",
+    {%- elif cookiecutter.datastore_type == "vertex_ai_vector_search" %}
+    vector_search_index: str = "",
+    vector_search_index_endpoint: str = "",
+    vector_search_data_bucket_name: str = "",
+    ingestion_batch_size: int = 1000,
+    {%- endif %}
 ) -> None:
     """Processes data and ingests it into a datastore for RAG Retrieval"""
 
@@ -45,9 +52,10 @@ def pipeline(
         destination_table=destination_table,
         deduped_table=deduped_table,
         location=location,
-
+        {%- if cookiecutter.datastore_type == "vertex_ai_search" %}
+        embedding_column="embedding",{% endif %}
     ).set_retry(num_retries=2)
-
+    {% if cookiecutter.datastore_type == "vertex_ai_search" %}
     # Ingest the processed data into Vertex AI Search datastore
     ingest_data(
         project_id=project_id,
@@ -56,3 +64,18 @@ def pipeline(
         data_store_id=data_store_id,
         embedding_column="embedding",
     ).set_retry(num_retries=2)
+    {% elif cookiecutter.datastore_type == "vertex_ai_vector_search" %}
+    # Ingest the processed data into Vertex AI Vector Search
+    ingest_data(
+        project_id=project_id,
+        location=location,
+        vector_search_index=vector_search_index,
+        vector_search_index_endpoint=vector_search_index_endpoint,
+        vector_search_data_bucket_name=vector_search_data_bucket_name,
+        input_table=processed_data.output,
+        schedule_time=dsl.PIPELINE_JOB_SCHEDULE_TIME_UTC_PLACEHOLDER,
+        is_incremental=False,
+        look_back_days=look_back_days,
+        ingestion_batch_size=ingestion_batch_size,
+    ).set_retry(num_retries=2)
+    {% endif %}
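With both branches templated, the rendered `pipeline` function is still an ordinary KFP v2 pipeline. A hedged sketch of compiling it with the KFP SDK; the output file name is illustrative, and the repo's actual submission logic lives in `data_ingestion_pipeline/submit_pipeline.py`:

```python
# Compile the (rendered) data-ingestion pipeline to a spec that
# Vertex AI Pipelines can run; submit_pipeline.py handles the real flow.
from kfp import compiler

from data_ingestion_pipeline.pipeline import pipeline

compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path="data-ingestion-pipeline.json",
)
```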