agent-starter-pack 0.1.7__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of agent-starter-pack might be problematic.

Files changed (80)
  1. {agent_starter_pack-0.1.7.dist-info → agent_starter_pack-0.2.0.dist-info}/METADATA +6 -6
  2. {agent_starter_pack-0.1.7.dist-info → agent_starter_pack-0.2.0.dist-info}/RECORD +77 -77
  3. agents/{agentic_rag_vertexai_search → agentic_rag}/README.md +3 -3
  4. agents/{agentic_rag_vertexai_search → agentic_rag}/app/agent.py +22 -6
  5. agents/agentic_rag/app/retrievers.py +132 -0
  6. agents/{agentic_rag_vertexai_search → agentic_rag}/notebooks/evaluating_langgraph_agent.ipynb +3 -3
  7. agents/{agentic_rag_vertexai_search → agentic_rag}/template/.templateconfig.yaml +3 -5
  8. agents/crewai_coding_crew/notebooks/evaluating_crewai_agent.ipynb +4 -4
  9. agents/crewai_coding_crew/notebooks/evaluating_langgraph_agent.ipynb +3 -3
  10. agents/langgraph_base_react/notebooks/evaluating_langgraph_agent.ipynb +3 -3
  11. agents/{multimodal_live_api → live_api}/README.md +7 -0
  12. agents/{multimodal_live_api → live_api}/app/agent.py +3 -11
  13. agents/{multimodal_live_api → live_api}/app/server.py +3 -2
  14. agents/{multimodal_live_api → live_api}/template/.templateconfig.yaml +2 -2
  15. src/base_template/Makefile +12 -7
  16. src/base_template/README.md +71 -71
  17. src/base_template/app/utils/tracing.py +3 -1
  18. src/base_template/app/utils/typing.py +1 -0
  19. src/base_template/deployment/cd/deploy-to-prod.yaml +10 -4
  20. src/base_template/deployment/cd/staging.yaml +11 -10
  21. src/base_template/deployment/ci/pr_checks.yaml +1 -1
  22. src/base_template/deployment/terraform/apis.tf +6 -0
  23. src/base_template/deployment/terraform/build_triggers.tf +34 -21
  24. src/base_template/deployment/terraform/dev/iam.tf +13 -6
  25. src/base_template/deployment/terraform/dev/log_sinks.tf +25 -28
  26. src/base_template/deployment/terraform/dev/providers.tf +1 -0
  27. src/base_template/deployment/terraform/dev/storage.tf +69 -11
  28. src/base_template/deployment/terraform/dev/variables.tf +50 -53
  29. src/base_template/deployment/terraform/dev/vars/env.tfvars +13 -11
  30. src/base_template/deployment/terraform/iam.tf +3 -3
  31. src/base_template/deployment/terraform/log_sinks.tf +24 -26
  32. src/base_template/deployment/terraform/providers.tf +2 -0
  33. src/base_template/deployment/terraform/service_accounts.tf +7 -7
  34. src/base_template/deployment/terraform/storage.tf +123 -11
  35. src/base_template/deployment/terraform/variables.tf +49 -70
  36. src/base_template/deployment/terraform/vars/env.tfvars +12 -17
  37. src/base_template/pyproject.toml +4 -3
  38. src/cli/commands/create.py +79 -19
  39. src/cli/commands/setup_cicd.py +91 -22
  40. src/cli/main.py +3 -1
  41. src/cli/utils/__init__.py +9 -2
  42. src/cli/utils/cicd.py +12 -0
  43. src/cli/utils/datastores.py +32 -0
  44. src/cli/utils/gcp.py +4 -6
  45. src/cli/utils/template.py +127 -45
  46. src/cli/utils/version.py +87 -0
  47. src/data_ingestion/README.md +24 -19
  48. src/data_ingestion/data_ingestion_pipeline/components/ingest_data.py +135 -2
  49. src/data_ingestion/data_ingestion_pipeline/components/process_data.py +276 -2
  50. src/data_ingestion/data_ingestion_pipeline/pipeline.py +28 -5
  51. src/data_ingestion/data_ingestion_pipeline/submit_pipeline.py +49 -14
  52. src/data_ingestion/pyproject.toml +1 -0
  53. src/deployment_targets/agent_engine/app/agent_engine_app.py +3 -1
  54. src/deployment_targets/cloud_run/tests/unit/test_server.py +15 -33
  55. src/frontends/live_api_react/frontend/package-lock.json +208 -168
  56. src/frontends/live_api_react/frontend/package.json +1 -1
  57. src/resources/containers/data_processing/Dockerfile +3 -1
  58. src/resources/locks/{uv-agentic_rag_vertexai_search-agent_engine.lock → uv-agentic_rag-agent_engine.lock} +747 -694
  59. src/resources/locks/{uv-agentic_rag_vertexai_search-cloud_run.lock → uv-agentic_rag-cloud_run.lock} +944 -806
  60. src/resources/locks/uv-crewai_coding_crew-agent_engine.lock +651 -694
  61. src/resources/locks/uv-crewai_coding_crew-cloud_run.lock +813 -789
  62. src/resources/locks/uv-langgraph_base_react-agent_engine.lock +666 -686
  63. src/resources/locks/uv-langgraph_base_react-cloud_run.lock +848 -798
  64. src/resources/locks/{uv-multimodal_live_api-cloud_run.lock → uv-live_api-cloud_run.lock} +856 -791
  65. src/resources/setup_cicd/cicd_variables.tf +5 -0
  66. src/resources/setup_cicd/github.tf +4 -2
  67. src/utils/watch_and_rebuild.py +14 -0
  68. agents/agentic_rag_vertexai_search/app/retrievers.py +0 -79
  69. src/deployment_targets/cloud_run/deployment/terraform/artifact_registry.tf +0 -22
  70. src/deployment_targets/cloud_run/deployment/terraform/dev/service_accounts.tf +0 -20
  71. {agent_starter_pack-0.1.7.dist-info → agent_starter_pack-0.2.0.dist-info}/WHEEL +0 -0
  72. {agent_starter_pack-0.1.7.dist-info → agent_starter_pack-0.2.0.dist-info}/entry_points.txt +0 -0
  73. {agent_starter_pack-0.1.7.dist-info → agent_starter_pack-0.2.0.dist-info}/licenses/LICENSE +0 -0
  74. /agents/{agentic_rag_vertexai_search → agentic_rag}/app/templates.py +0 -0
  75. /agents/{agentic_rag_vertexai_search → agentic_rag}/tests/integration/test_agent.py +0 -0
  76. /agents/{multimodal_live_api → live_api}/app/templates.py +0 -0
  77. /agents/{multimodal_live_api → live_api}/app/vector_store.py +0 -0
  78. /agents/{multimodal_live_api → live_api}/tests/integration/test_server_e2e.py +0 -0
  79. /agents/{multimodal_live_api → live_api}/tests/load_test/load_test.py +0 -0
  80. /agents/{multimodal_live_api → live_api}/tests/unit/test_server.py +0 -0

src/cli/utils/version.py (new file)
@@ -0,0 +1,87 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Version checking utilities for the CLI."""
+
+ import logging
+ from importlib.metadata import PackageNotFoundError, version
+
+ import requests
+ from packaging import version as pkg_version
+ from rich.console import Console
+
+ console = Console()
+
+ PACKAGE_NAME = "agent-starter-pack"
+
+
+ def get_current_version() -> str:
+     """Get the current installed version of the package."""
+     try:
+         return version(PACKAGE_NAME)
+     except PackageNotFoundError:
+         # For development environments where package isn't installed
+         return "0.0.0"  # Default if version can't be determined
+
+
+ def get_latest_version() -> str:
+     """Get the latest version available on PyPI."""
+     try:
+         response = requests.get(f"https://pypi.org/pypi/{PACKAGE_NAME}/json", timeout=2)
+         if response.status_code == 200:
+             return response.json()["info"]["version"]
+         return "0.0.0"
+     except Exception:
+         return "0.0.0"  # Default if PyPI can't be reached
+
+
+ def check_for_updates() -> tuple[bool, str, str]:
+     """Check if a newer version of the package is available.
+
+     Returns:
+         Tuple of (needs_update, current_version, latest_version)
+     """
+     current = get_current_version()
+     latest = get_latest_version()
+
+     needs_update = pkg_version.parse(latest) > pkg_version.parse(current)
+
+     return needs_update, current, latest
+
+
+ def display_update_message() -> None:
+     """Check for updates and display a message if an update is available."""
+     try:
+         needs_update, current, latest = check_for_updates()
+
+         if needs_update:
+             console.print(
+                 f"\n[yellow]⚠️ Update available: {current} → {latest}[/]",
+                 highlight=False,
+             )
+             console.print(
+                 f"[yellow]Run `pip install --upgrade {PACKAGE_NAME}` to update.",
+                 highlight=False,
+             )
+             console.print(
+                 f"[yellow]Or, if you used pipx: `pipx upgrade {PACKAGE_NAME}`",
+                 highlight=False,
+             )
+             console.print(
+                 f"[yellow]Or, if you used uv: `uv pip install --upgrade {PACKAGE_NAME}`",
+                 highlight=False,
+             )
+     except Exception as e:
+         # Don't let version checking errors affect the CLI
+         logging.debug(f"Error checking for updates: {e}")
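
For context, a minimal usage sketch of these helpers follows. The import path `src.cli.utils.version` and the wiring into the CLI entrypoint are assumptions (the diff only shows that `src/cli/main.py` gained a few lines, not their content), so treat this as an illustration of the intended call pattern rather than the actual 0.2.0 wiring.

```python
# Hypothetical usage of the new version helpers above; the module path is assumed.
from src.cli.utils.version import check_for_updates, display_update_message

if __name__ == "__main__":
    needs_update, current, latest = check_for_updates()
    print(f"installed={current} latest={latest} update_available={needs_update}")
    # Prints the pip/pipx/uv upgrade hints only when PyPI reports a newer release;
    # any network or parsing error is swallowed and logged at DEBUG level.
    display_update_message()
```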

src/data_ingestion/README.md
@@ -1,8 +1,8 @@
  # Data Ingestion Pipeline

- This pipeline automates the ingestion of data into Vertex AI Search, streamlining the process of building Retrieval Augmented Generation (RAG) applications.
+ This pipeline automates the ingestion of data into Vertex AI{%- if cookiecutter.datastore_type == "vertex_ai_vector_search" %} Vector{%- endif %} Search, streamlining the process of building Retrieval Augmented Generation (RAG) applications.

- It orchestrates the complete workflow: loading data, chunking it into manageable segments, generating embeddings using Vertex AI Embeddings, and importing the processed data into your Vertex AI Search datastore.
+ It orchestrates the complete workflow: loading data, chunking it into manageable segments, generating embeddings using Vertex AI Embeddings, and importing the processed data into your Vertex AI{%- if cookiecutter.datastore_type == "vertex_ai_vector_search" %} Vector{%- endif %} Search datastore.

  You can trigger the pipeline for an initial data load or schedule it to run periodically, ensuring your search index remains current. Vertex AI Pipelines provides the orchestration and monitoring capabilities for this process.

@@ -37,7 +37,7 @@ uv sync --frozen
  **c. Execute the Pipeline:**

  Run the following command to execute the data ingestion pipeline. Replace the placeholder values with your actual project details.
- 
+ {%- if cookiecutter.datastore_type == "vertex_ai_search" %}
  ```bash
  PROJECT_ID="YOUR_PROJECT_ID"
  REGION="us-central1"
@@ -47,22 +47,26 @@ uv run data_ingestion_pipeline/submit_pipeline.py \
    --region=$REGION \
    --data-store-region=$DATA_STORE_REGION \
    --data-store-id="sample-datastore" \
-   --service-account="vertexai-pipelines-sa@$PROJECT_ID.iam.gserviceaccount.com" \
-   --pipeline-root="gs://$PROJECT_ID-pipeline-artifacts" \
+   --service-account="{{cookiecutter.project_name}}-rag@$PROJECT_ID.iam.gserviceaccount.com" \
+   --pipeline-root="gs://$PROJECT_ID-{{cookiecutter.project_name}}-rag" \
    --pipeline-name="data-ingestion-pipeline"
  ```
- 
- **Parameter Explanation:**
- 
- * `--project-id`: Your Google Cloud project ID.
- * `--region`: The region where Vertex AI Pipelines will run (e.g., `us-central1`).
- * `--data-store-region`: The region for Vertex AI Search operations (e.g., `us` or `eu`).
- * `--data-store-id`: The ID of your Vertex AI Search datastore.
- * `--service-account`: The service account email used for pipeline execution. Ensure this service account has the necessary permissions (e.g., Vertex AI User, Storage Object Admin).
- * `--pipeline-root`: The Google Cloud Storage (GCS) bucket for storing pipeline artifacts.
- * `--pipeline-name`: A descriptive name for your pipeline.
- * `--schedule-only` (Optional): If specified, the pipeline will only be scheduled and not executed immediately. Requires `--cron-schedule`.
- * `--cron-schedule` (Optional): A cron expression defining the pipeline's schedule (e.g., `"0 9 * * 1"` for every Monday at 9:00 AM UTC).
+ {%- elif cookiecutter.datastore_type == "vertex_ai_vector_search" %}
+ ```bash
+ PROJECT_ID="YOUR_PROJECT_ID"
+ REGION="us-central1"
+ VECTOR_SEARCH_INDEX="YOUR_VECTOR_SEARCH_INDEX"
+ VECTOR_SEARCH_INDEX_ENDPOINT="YOUR_VECTOR_SEARCH_INDEX_ENDPOINT"
+ uv run data_ingestion_pipeline/submit_pipeline.py \
+   --project-id=$PROJECT_ID \
+   --region=$REGION \
+   --vector-search-index=$VECTOR_SEARCH_INDEX \
+   --vector-search-index-endpoint=$VECTOR_SEARCH_INDEX_ENDPOINT \
+   --service-account="{{cookiecutter.project_name}}-rag@$PROJECT_ID.iam.gserviceaccount.com" \
+   --pipeline-root="gs://$PROJECT_ID-{{cookiecutter.project_name}}-rag" \
+   --pipeline-name="data-ingestion-pipeline"
+ ```
+ {%- endif %}

  **d. Pipeline Scheduling and Execution:**

@@ -74,6 +78,7 @@ The pipeline's configuration and execution status will be printed to the console

  ## Testing Your RAG Application

- Once the data ingestion pipeline completes successfully, you can test your RAG application with Vertex AI Search.
- 
+ Once the data ingestion pipeline completes successfully, you can test your RAG application with Vertex AI{%- if cookiecutter.datastore_type == "vertex_ai_vector_search" %} Vector{%- endif %} Search.
+ {%- if cookiecutter.datastore_type == "vertex_ai_search" %}
  > **Troubleshooting:** If you encounter the error `"google.api_core.exceptions.InvalidArgument: 400 The embedding field path: embedding not found in schema"` after the initial data ingestion, wait a few minutes and try again. This delay allows Vertex AI Search to fully index the ingested data.
+ {%- endif %}

src/data_ingestion/data_ingestion_pipeline/components/ingest_data.py
@@ -11,12 +11,13 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ # ruff: noqa

  from kfp.dsl import Dataset, Input, component
- 
+ {% if cookiecutter.datastore_type == "vertex_ai_search" %}

  @component(
-     base_image="us-docker.pkg.dev/production-ai-template/starter-pack/data_processing:0.1"
+     base_image="us-docker.pkg.dev/production-ai-template/starter-pack/data_processing:0.2"
  )
  def ingest_data(
      project_id: str,
@@ -37,6 +38,7 @@ def ingest_data(
      """
      import json
      import logging
+     import time

      from google.api_core.client_options import ClientOptions
      from google.cloud import discoveryengine
@@ -173,3 +175,134 @@ def ingest_data(
          input_files_uri=input_files.uri,
      )
      logging.info("Data import completed")
+     logging.info(
+         "Sleeping for 3 minutes to allow Vertex AI Search to properly index the data..."
+     )
+     time.sleep(180)  # Sleep for 180 seconds (3 minutes)
+     logging.info("Sleep completed. Data indexing should now be complete.")
+ {% elif cookiecutter.datastore_type == "vertex_ai_vector_search" %}
+ from google_cloud_pipeline_components.types.artifact_types import BQTable
+
+
+ @component(
+     base_image="us-docker.pkg.dev/production-ai-template/starter-pack/data_processing:0.2"
+ )
+ def ingest_data(
+     project_id: str,
+     location: str,
+     vector_search_index: str,
+     vector_search_index_endpoint: str,
+     vector_search_data_bucket_name: str,
+     schedule_time: str,
+     ingestion_batch_size: int,
+     input_table: Input[BQTable],
+     is_incremental: bool = True,
+     look_back_days: int = 1,
+ ) -> None:
+     """Process and ingest documents into Vertex AI Vector Search.
+
+     Args:
+         project_id: Google Cloud project ID
+     """
+     import logging
+     from datetime import datetime, timedelta
+
+     import bigframes.pandas as bpd
+     from google.cloud import aiplatform
+     from langchain_google_vertexai import VectorSearchVectorStore
+     from langchain_google_vertexai import VertexAIEmbeddings
+
+     # Initialize logging
+     logging.basicConfig(level=logging.INFO)
+
+     # Initialize clients
+     logging.info("Initializing clients...")
+     bpd.options.bigquery.project = project_id
+     bpd.options.bigquery.location = location
+     logging.info("Clients initialized.")
+
+     # Set date range for data fetch
+     schedule_time_dt: datetime = datetime.fromisoformat(
+         schedule_time.replace("Z", "+00:00")
+     )
+     if schedule_time_dt.year == 1970:
+         logging.warning(
+             "Pipeline schedule not set. Setting schedule_time to current date."
+         )
+         schedule_time_dt = datetime.now()
+
+     # Note: The following line sets the schedule time 5 years back to allow sample data to be present.
+     # For your use case, please comment out the following line to use the actual schedule time.
+     schedule_time_dt = schedule_time_dt - timedelta(days=5 * 365)
+
+     START_DATE: datetime = schedule_time_dt - timedelta(
+         days=look_back_days
+     )  # Start date for data processing window
+     END_DATE: datetime = schedule_time_dt  # End date for data processing window
+
+     logging.info(f"Date range set: START_DATE={START_DATE}, END_DATE={END_DATE}")
+
+     dataset = input_table.metadata["datasetId"]
+     table = input_table.metadata["tableId"]
+
+     query = f"""
+     SELECT
+         question_id
+         , last_edit_date
+         , full_text_md
+         , text_chunk
+         , chunk_id
+         , embedding
+     FROM {project_id}.{dataset}.{table}
+     WHERE TRUE
+     {f'AND DATETIME(creation_timestamp) BETWEEN DATETIME("{START_DATE}") AND DATETIME("{END_DATE}")' if is_incremental else ""}
+     """
+     df = (
+         bpd.read_gbq(query)
+         .sort_values("last_edit_date", ascending=False)
+         .drop_duplicates("question_id")
+         .reset_index(drop=True)
+     )
+
+     aiplatform.init(
+         project=project_id,
+         location=location,
+         staging_bucket=vector_search_data_bucket_name,
+     )
+
+     embedding_model = VertexAIEmbeddings(model_name="text-embedding-005")
+     my_index = aiplatform.MatchingEngineIndex(vector_search_index)
+     my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint(
+         vector_search_index_endpoint
+     )
+     vector_store = VectorSearchVectorStore.from_components(
+         project_id=project_id,
+         region=location,
+         gcs_bucket_name=vector_search_data_bucket_name.replace("gs://", ""),
+         index_id=my_index.name,
+         endpoint_id=my_index_endpoint.name,
+         embedding=embedding_model,
+         stream_update=True,
+     )
+
+     for batch_num, start in enumerate(range(0, len(df), ingestion_batch_size)):
+         ids = (
+             df.iloc[start : start + ingestion_batch_size]
+             .question_id.astype(str)
+             .tolist()
+         )
+         texts = df.iloc[start : start + ingestion_batch_size].text_chunk.tolist()
+         embeddings = df.iloc[start : start + ingestion_batch_size].embedding.tolist()
+         metadatas = (
+             df.iloc[start : start + ingestion_batch_size]
+             .drop(columns=["embedding", "last_edit_date"])
+             .to_dict(orient="records")
+         )
+         vector_store.add_texts_with_embeddings(
+             ids=ids,
+             texts=texts,
+             embeddings=embeddings,
+             metadatas=metadatas,
+             is_complete_overwrite=True,
+         )
+ {% endif %}
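
The ingestion loop above slices the BigQuery result into fixed-size batches before streaming them to Vector Search. The sketch below reproduces just that slicing with plain pandas so it can run locally; the column names and loop structure mirror the component, and the final `add_texts_with_embeddings` call is only indicated in a comment.

```python
# Illustration only: the same batch slicing as the component's ingestion loop,
# using a small in-memory pandas frame instead of a BigQuery-backed bigframes one.
import pandas as pd

df = pd.DataFrame(
    {
        "question_id": [1, 2, 3, 4, 5],
        "text_chunk": [f"chunk {i}" for i in range(5)],
        "embedding": [[0.1 * i, 0.2 * i] for i in range(5)],
    }
)
ingestion_batch_size = 2  # the pipeline parameter defaults to 1000

for batch_num, start in enumerate(range(0, len(df), ingestion_batch_size)):
    batch = df.iloc[start : start + ingestion_batch_size]
    ids = batch.question_id.astype(str).tolist()
    texts = batch.text_chunk.tolist()
    embeddings = batch.embedding.tolist()
    # In the component, these lists are passed to
    # VectorSearchVectorStore.add_texts_with_embeddings(...) for streaming upsert.
    print(f"batch {batch_num}: ids={ids}")
```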

src/data_ingestion/data_ingestion_pipeline/components/process_data.py
@@ -11,6 +11,7 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ # ruff: noqa

  """
  This component is derived from the notebook:
@@ -21,9 +22,9 @@ It leverages BigQuery for data processing. We also suggest looking at remote fun

  from kfp.dsl import Dataset, Output, component

- 
+ {% if cookiecutter.datastore_type == "vertex_ai_search" %}
  @component(
-     base_image="us-docker.pkg.dev/production-ai-template/starter-pack/data_processing:0.1"
+     base_image="us-docker.pkg.dev/production-ai-template/starter-pack/data_processing:0.2"
  )
  def process_data(
      project_id: str,
@@ -319,3 +320,276 @@ def process_data(
      )
      extract_job.result()
      logging.info("Exported to JSONL.")
+ {% elif cookiecutter.datastore_type == "vertex_ai_vector_search" %}
+ from google_cloud_pipeline_components.types.artifact_types import BQTable
+
+
+ @component(
+     base_image="us-docker.pkg.dev/production-ai-template/starter-pack/data_processing:0.2",
+ )
+ def process_data(
+     project_id: str,
+     schedule_time: str,
+     output_table: Output[BQTable],
+     is_incremental: bool = True,
+     look_back_days: int = 1,
+     chunk_size: int = 1500,
+     chunk_overlap: int = 20,
+     destination_dataset: str = "stackoverflow_data",
+     destination_table: str = "incremental_questions_embeddings",
+     deduped_table: str = "questions_embeddings",
+     location: str = "us-central1",
+ ) -> None:
+     """Process StackOverflow questions and answers by:
+     1. Fetching data from BigQuery
+     2. Converting HTML to markdown
+     3. Splitting text into chunks
+     4. Generating embeddings
+     5. Storing results in BigQuery
+     6. Exporting to JSONL
+
+     Args:
+         output_files: Output dataset path
+         is_incremental: Whether to process only recent data
+         look_back_days: Number of days to look back for incremental processing
+         chunk_size: Size of text chunks
+         chunk_overlap: Overlap between chunks
+         destination_dataset: BigQuery dataset for storing results
+         destination_table: Table for storing incremental results
+         deduped_table: Table for storing deduplicated results
+         location: BigQuery location
+     """
+     import logging
+     from datetime import datetime, timedelta
+
+     import backoff
+     import bigframes.ml.llm as llm
+     import bigframes.pandas as bpd
+     import google.api_core.exceptions
+     import swifter
+     from google.cloud import bigquery
+     from langchain.text_splitter import RecursiveCharacterTextSplitter
+     from markdownify import markdownify
+
+     # Initialize logging
+     logging.basicConfig(level=logging.INFO)
+     logging.info(f"Using {swifter} for apply operations.")
+
+     # Initialize clients
+     logging.info("Initializing clients...")
+     bq_client = bigquery.Client(project=project_id, location=location)
+     bpd.options.bigquery.project = project_id
+     bpd.options.bigquery.location = location
+     logging.info("Clients initialized.")
+
+     # Set date range for data fetch
+     schedule_time_dt: datetime = datetime.fromisoformat(
+         schedule_time.replace("Z", "+00:00")
+     )
+     if schedule_time_dt.year == 1970:
+         logging.warning(
+             "Pipeline schedule not set. Setting schedule_time to current date."
+         )
+         schedule_time_dt = datetime.now()
+
+     # Note: The following line sets the schedule time 5 years back to allow sample data to be present.
+     # For your use case, please comment out the following line to use the actual schedule time.
+     schedule_time_dt = schedule_time_dt - timedelta(days=5 * 365)
+
+     START_DATE: datetime = schedule_time_dt - timedelta(
+         days=look_back_days
+     )  # Start date for data processing window
+     END_DATE: datetime = schedule_time_dt  # End date for data processing window
+
+     logging.info(f"Date range set: START_DATE={START_DATE}, END_DATE={END_DATE}")
+
+     def fetch_stackoverflow_data(
+         dataset_suffix: str, start_date: str, end_date: str
+     ) -> bpd.DataFrame:
+         """Fetch StackOverflow data from BigQuery."""
+         query = f"""
+         SELECT
+             creation_date,
+             last_edit_date,
+             question_id,
+             question_title,
+             question_body AS question_text,
+             answers
+         FROM `production-ai-template.stackoverflow_qa_{dataset_suffix}.stackoverflow_python_questions_and_answers`
+         WHERE TRUE
+         {f'AND TIMESTAMP_TRUNC(creation_date, DAY) BETWEEN TIMESTAMP("{start_date}") AND TIMESTAMP("{end_date}")' if is_incremental else ""}
+         """
+         logging.info("Fetching StackOverflow data from BigQuery...")
+         return bpd.read_gbq(query)
+
+     def convert_html_to_markdown(html: str) -> str:
+         """Convert HTML into Markdown for easier parsing and rendering after LLM response."""
+         return markdownify(html).strip()
+
+     def create_answers_markdown(answers: list) -> str:
+         """Convert each answer's HTML to markdown and concatenate into a single markdown text."""
+         answers_md = ""
+         for index, answer_record in enumerate(answers):
+             answers_md += (
+                 f"\n\n## Answer {index + 1}:\n"  # Answer number is H2 heading size
+             )
+             answers_md += convert_html_to_markdown(answer_record["body"])
+         return answers_md
+
+     def create_table_if_not_exist(
+         df: bpd.DataFrame,
+         project_id: str,
+         dataset_id: str,
+         table_id: str,
+         partition_column: str,
+         location: str = location,
+     ) -> None:
+         """Create BigQuery table with time partitioning if it doesn't exist."""
+         table_schema = bq_client.get_table(df.head(0).to_gbq()).schema
+         table = bigquery.Table(
+             f"{project_id}.{dataset_id}.{table_id}", schema=table_schema
+         )
+         table.time_partitioning = bigquery.TimePartitioning(
+             type_=bigquery.TimePartitioningType.DAY, field=partition_column
+         )
+
+         dataset = bigquery.Dataset(f"{project_id}.{dataset_id}")
+         dataset.location = location
+         bq_client.create_dataset(dataset, exists_ok=True)
+         bq_client.create_table(table=table, exists_ok=True)
+
+     # Fetch and preprocess data
+     logging.info("Fetching and preprocessing data...")
+     df = fetch_stackoverflow_data(
+         start_date=START_DATE.strftime("%Y-%m-%d"),
+         end_date=END_DATE.strftime("%Y-%m-%d"),
+         dataset_suffix=location.lower().replace("-", "_"),
+     )
+     df = (
+         df.sort_values("last_edit_date", ascending=False)
+         .drop_duplicates("question_id")
+         .reset_index(drop=True)
+     )
+     logging.info("Data fetched and preprocessed.")
+
+     # Convert content to markdown
+     logging.info("Converting content to markdown...")
+
+     # Create markdown fields efficiently
+     df["question_title_md"] = (
+         "# " + df["question_title"] + "\n"
+     )  # Title is H1 heading size
+     df["question_text_md"] = (
+         df["question_text"].to_pandas().swifter.apply(convert_html_to_markdown) + "\n"
+     )
+     df["answers_md"] = df["answers"].to_pandas().swifter.apply(create_answers_markdown)
+
+     # Create a column containing the whole markdown text
+     df["full_text_md"] = (
+         df["question_title_md"] + df["question_text_md"] + df["answers_md"]
+     )
+     logging.info("Content converted to markdown.")
+
+     # Keep only necessary columns
+     df = df[["last_edit_date", "question_id", "question_text", "full_text_md"]]
+
+     # Split text into chunks
+     logging.info("Splitting text into chunks...")
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=chunk_size,
+         chunk_overlap=chunk_overlap,
+         length_function=len,
+     )
+
+     df["text_chunk"] = (
+         df["full_text_md"]
+         .to_pandas()
+         .astype(object)
+         .swifter.apply(text_splitter.split_text)
+     )
+     logging.info("Text split into chunks.")
+
+     # Create chunk IDs and explode chunks into rows
+     logging.info("Creating chunk IDs and exploding chunks into rows...")
+     chunk_ids = [
+         str(idx) for text_chunk in df["text_chunk"] for idx in range(len(text_chunk))
+     ]
+     df = df.explode("text_chunk").reset_index(drop=True)
+     df["chunk_id"] = df["question_id"].astype("string") + "__" + chunk_ids
+     logging.info("Chunk IDs created and chunks exploded.")
+
+     # Generate embeddings
+     logging.info("Generating embeddings...")
+
+     # The first invocation in a new project might fail due to permission propagation.
+     @backoff.on_exception(
+         backoff.expo, google.api_core.exceptions.InvalidArgument, max_tries=10
+     )
+     def create_embedder() -> llm.TextEmbeddingGenerator:
+         return llm.TextEmbeddingGenerator(model_name="text-embedding-005")
+
+     embedder = create_embedder()
+
+     embeddings_df = embedder.predict(df["text_chunk"])
+     logging.info("Embeddings generated.")
+
+     df = df.assign(
+         embedding=embeddings_df["ml_generate_embedding_result"],
+         embedding_statistics=embeddings_df["ml_generate_embedding_statistics"],
+         embedding_status=embeddings_df["ml_generate_embedding_status"],
+         creation_timestamp=datetime.now(),
+     )
+
+     # Store results in BigQuery
+     PARTITION_DATE_COLUMN = "creation_timestamp"
+
+     # Create and populate incremental table
+     logging.info("Creating and populating incremental table...")
+     create_table_if_not_exist(
+         df=df,
+         project_id=project_id,
+         dataset_id=destination_dataset,
+         table_id=destination_table,
+         partition_column=PARTITION_DATE_COLUMN,
+     )
+
+     if_exists_mode = "append" if is_incremental else "replace"
+     df.to_gbq(
+         destination_table=f"{destination_dataset}.{destination_table}",
+         if_exists=if_exists_mode,
+     )
+     logging.info("Incremental table created and populated.")
+
+     # Create deduplicated table
+     logging.info("Creating deduplicated table...")
+     df_questions = bpd.read_gbq(
+         f"{destination_dataset}.{destination_table}", use_cache=False
+     )
+     max_date_df = (
+         df_questions.groupby("question_id")["creation_timestamp"].max().reset_index()
+     )
+     df_questions_dedup = max_date_df.merge(
+         df_questions, how="inner", on=["question_id", "creation_timestamp"]
+     )
+
+     create_table_if_not_exist(
+         df=df_questions_dedup,
+         project_id=project_id,
+         dataset_id=destination_dataset,
+         table_id=deduped_table,
+         partition_column=PARTITION_DATE_COLUMN,
+     )
+
+     df_questions_dedup.to_gbq(
+         destination_table=f"{destination_dataset}.{deduped_table}",
+         if_exists="replace",
+     )
+     logging.info("Deduplicated table created and populated.")
+     # Set artifact metadata (important!)
+     output_table.uri = (
+         f"bq://{project_id}.{destination_dataset}.{deduped_table}"  # Full BQ URI
+     )
+     output_table.metadata["projectId"] = project_id
+     output_table.metadata["datasetId"] = destination_dataset
+     output_table.metadata["tableId"] = deduped_table
+ {% endif %}
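
The chunking step above is easy to misread because it mixes bigframes with a plain Python list comprehension. The sketch below replays the same chunk-ID construction with ordinary pandas (standing in for bigframes purely for illustration): every question's list of chunks is exploded into one row per chunk, and each row gets a stable `<question_id>__<chunk_index>` ID.

```python
# Illustration only: the chunk-ID / explode step from process_data, in plain pandas.
import pandas as pd

df = pd.DataFrame(
    {
        "question_id": [101, 102],
        "text_chunk": [["q101 part 0", "q101 part 1"], ["q102 part 0"]],
    }
)

# One running index per chunk *within* each question, flattened in row order.
chunk_ids = [str(idx) for chunks in df["text_chunk"] for idx in range(len(chunks))]
df = df.explode("text_chunk").reset_index(drop=True)
df["chunk_id"] = df["question_id"].astype(str) + "__" + chunk_ids

print(df[["chunk_id", "text_chunk"]])
#   chunk_id   text_chunk
# 0   101__0  q101 part 0
# 1   101__1  q101 part 1
# 2   102__0  q102 part 0
```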

src/data_ingestion/data_ingestion_pipeline/pipeline.py
@@ -21,15 +21,22 @@ from kfp import dsl
  def pipeline(
      project_id: str,
      location: str,
-     data_store_region: str,
-     data_store_id: str,
      is_incremental: bool = True,
      look_back_days: int = 1,
      chunk_size: int = 1500,
      chunk_overlap: int = 20,
-     destination_dataset: str = "stackoverflow_data",
      destination_table: str = "incremental_questions_embeddings",
      deduped_table: str = "questions_embeddings",
+     destination_dataset: str = "{{cookiecutter.project_name | replace('-', '_')}}_stackoverflow_data",
+     {%- if cookiecutter.datastore_type == "vertex_ai_search" %}
+     data_store_region: str = "",
+     data_store_id: str = "",
+     {%- elif cookiecutter.datastore_type == "vertex_ai_vector_search" %}
+     vector_search_index: str = "",
+     vector_search_index_endpoint: str = "",
+     vector_search_data_bucket_name: str = "",
+     ingestion_batch_size: int = 1000,
+     {%- endif %}
  ) -> None:
      """Processes data and ingests it into a datastore for RAG Retrieval"""

@@ -45,9 +52,10 @@ def pipeline(
          destination_table=destination_table,
          deduped_table=deduped_table,
          location=location,
-         embedding_column="embedding",
+         {%- if cookiecutter.datastore_type == "vertex_ai_search" %}
+         embedding_column="embedding",{% endif %}
      ).set_retry(num_retries=2)
- 
+     {% if cookiecutter.datastore_type == "vertex_ai_search" %}
      # Ingest the processed data into Vertex AI Search datastore
      ingest_data(
          project_id=project_id,
@@ -56,3 +64,18 @@
          data_store_id=data_store_id,
          embedding_column="embedding",
      ).set_retry(num_retries=2)
+     {% elif cookiecutter.datastore_type == "vertex_ai_vector_search" %}
+     # Ingest the processed data into Vertex AI Vector Search
+     ingest_data(
+         project_id=project_id,
+         location=location,
+         vector_search_index=vector_search_index,
+         vector_search_index_endpoint=vector_search_index_endpoint,
+         vector_search_data_bucket_name=vector_search_data_bucket_name,
+         input_table=processed_data.output,
+         schedule_time=dsl.PIPELINE_JOB_SCHEDULE_TIME_UTC_PLACEHOLDER,
+         is_incremental=False,
+         look_back_days=look_back_days,
+         ingestion_batch_size=ingestion_batch_size,
+     ).set_retry(num_retries=2)
+     {% endif %}
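
Because the pipeline signature above is Jinja-templated, it can be hard to see what a generated project actually ships. Below is an illustrative rendering for `datastore_type == "vertex_ai_vector_search"` with a made-up project name of `my-agent`; only the signature and docstring are shown, and the body with the `process_data` / `ingest_data` calls is omitted.

```python
# Illustrative rendering only: datastore_type == "vertex_ai_vector_search",
# project name "my-agent" (so destination_dataset becomes "my_agent_stackoverflow_data").
# The function body with the process_data / ingest_data calls is omitted here.
def pipeline(
    project_id: str,
    location: str,
    is_incremental: bool = True,
    look_back_days: int = 1,
    chunk_size: int = 1500,
    chunk_overlap: int = 20,
    destination_table: str = "incremental_questions_embeddings",
    deduped_table: str = "questions_embeddings",
    destination_dataset: str = "my_agent_stackoverflow_data",
    vector_search_index: str = "",
    vector_search_index_endpoint: str = "",
    vector_search_data_bucket_name: str = "",
    ingestion_batch_size: int = 1000,
) -> None:
    """Processes data and ingests it into a datastore for RAG Retrieval"""
```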