agent-starter-pack 0.0.1b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of agent-starter-pack might be problematic.

Files changed (162)
  1. agent_starter_pack-0.0.1b0.dist-info/METADATA +143 -0
  2. agent_starter_pack-0.0.1b0.dist-info/RECORD +162 -0
  3. agent_starter_pack-0.0.1b0.dist-info/WHEEL +4 -0
  4. agent_starter_pack-0.0.1b0.dist-info/entry_points.txt +2 -0
  5. agent_starter_pack-0.0.1b0.dist-info/licenses/LICENSE +201 -0
  6. agents/agentic_rag_vertexai_search/README.md +22 -0
  7. agents/agentic_rag_vertexai_search/app/agent.py +145 -0
  8. agents/agentic_rag_vertexai_search/app/retrievers.py +79 -0
  9. agents/agentic_rag_vertexai_search/app/templates.py +53 -0
  10. agents/agentic_rag_vertexai_search/notebooks/evaluating_langgraph_agent.ipynb +1561 -0
  11. agents/agentic_rag_vertexai_search/template/.templateconfig.yaml +14 -0
  12. agents/agentic_rag_vertexai_search/tests/integration/test_agent.py +57 -0
  13. agents/crewai_coding_crew/README.md +34 -0
  14. agents/crewai_coding_crew/app/agent.py +86 -0
  15. agents/crewai_coding_crew/app/crew/config/agents.yaml +39 -0
  16. agents/crewai_coding_crew/app/crew/config/tasks.yaml +37 -0
  17. agents/crewai_coding_crew/app/crew/crew.py +71 -0
  18. agents/crewai_coding_crew/notebooks/evaluating_crewai_agent.ipynb +1571 -0
  19. agents/crewai_coding_crew/notebooks/evaluating_langgraph_agent.ipynb +1561 -0
  20. agents/crewai_coding_crew/template/.templateconfig.yaml +12 -0
  21. agents/crewai_coding_crew/tests/integration/test_agent.py +47 -0
  22. agents/langgraph_base_react/README.md +9 -0
  23. agents/langgraph_base_react/app/agent.py +73 -0
  24. agents/langgraph_base_react/notebooks/evaluating_langgraph_agent.ipynb +1561 -0
  25. agents/langgraph_base_react/template/.templateconfig.yaml +13 -0
  26. agents/langgraph_base_react/tests/integration/test_agent.py +48 -0
  27. agents/multimodal_live_api/README.md +50 -0
  28. agents/multimodal_live_api/app/agent.py +86 -0
  29. agents/multimodal_live_api/app/server.py +193 -0
  30. agents/multimodal_live_api/app/templates.py +51 -0
  31. agents/multimodal_live_api/app/vector_store.py +55 -0
  32. agents/multimodal_live_api/template/.templateconfig.yaml +15 -0
  33. agents/multimodal_live_api/tests/integration/test_server_e2e.py +254 -0
  34. agents/multimodal_live_api/tests/load_test/load_test.py +40 -0
  35. agents/multimodal_live_api/tests/unit/test_server.py +143 -0
  36. src/base_template/.gitignore +197 -0
  37. src/base_template/Makefile +37 -0
  38. src/base_template/README.md +91 -0
  39. src/base_template/app/utils/tracing.py +143 -0
  40. src/base_template/app/utils/typing.py +115 -0
  41. src/base_template/deployment/README.md +123 -0
  42. src/base_template/deployment/cd/deploy-to-prod.yaml +98 -0
  43. src/base_template/deployment/cd/staging.yaml +215 -0
  44. src/base_template/deployment/ci/pr_checks.yaml +51 -0
  45. src/base_template/deployment/terraform/apis.tf +34 -0
  46. src/base_template/deployment/terraform/build_triggers.tf +122 -0
  47. src/base_template/deployment/terraform/dev/apis.tf +42 -0
  48. src/base_template/deployment/terraform/dev/iam.tf +90 -0
  49. src/base_template/deployment/terraform/dev/log_sinks.tf +66 -0
  50. src/base_template/deployment/terraform/dev/providers.tf +29 -0
  51. src/base_template/deployment/terraform/dev/storage.tf +76 -0
  52. src/base_template/deployment/terraform/dev/variables.tf +126 -0
  53. src/base_template/deployment/terraform/dev/vars/env.tfvars +21 -0
  54. src/base_template/deployment/terraform/iam.tf +130 -0
  55. src/base_template/deployment/terraform/locals.tf +50 -0
  56. src/base_template/deployment/terraform/log_sinks.tf +72 -0
  57. src/base_template/deployment/terraform/providers.tf +35 -0
  58. src/base_template/deployment/terraform/service_accounts.tf +42 -0
  59. src/base_template/deployment/terraform/storage.tf +100 -0
  60. src/base_template/deployment/terraform/variables.tf +202 -0
  61. src/base_template/deployment/terraform/vars/env.tfvars +43 -0
  62. src/base_template/pyproject.toml +113 -0
  63. src/base_template/tests/unit/test_utils/test_tracing_exporter.py +140 -0
  64. src/cli/commands/create.py +534 -0
  65. src/cli/commands/setup_cicd.py +730 -0
  66. src/cli/main.py +35 -0
  67. src/cli/utils/__init__.py +35 -0
  68. src/cli/utils/cicd.py +662 -0
  69. src/cli/utils/gcp.py +120 -0
  70. src/cli/utils/logging.py +51 -0
  71. src/cli/utils/template.py +644 -0
  72. src/data_ingestion/README.md +79 -0
  73. src/data_ingestion/data_ingestion_pipeline/components/ingest_data.py +175 -0
  74. src/data_ingestion/data_ingestion_pipeline/components/process_data.py +321 -0
  75. src/data_ingestion/data_ingestion_pipeline/pipeline.py +58 -0
  76. src/data_ingestion/data_ingestion_pipeline/submit_pipeline.py +184 -0
  77. src/data_ingestion/pyproject.toml +17 -0
  78. src/data_ingestion/uv.lock +999 -0
  79. src/deployment_targets/agent_engine/app/agent_engine_app.py +238 -0
  80. src/deployment_targets/agent_engine/app/utils/gcs.py +42 -0
  81. src/deployment_targets/agent_engine/deployment_metadata.json +4 -0
  82. src/deployment_targets/agent_engine/notebooks/intro_reasoning_engine.ipynb +869 -0
  83. src/deployment_targets/agent_engine/tests/integration/test_agent_engine_app.py +120 -0
  84. src/deployment_targets/agent_engine/tests/load_test/.results/.placeholder +0 -0
  85. src/deployment_targets/agent_engine/tests/load_test/.results/report.html +264 -0
  86. src/deployment_targets/agent_engine/tests/load_test/.results/results_exceptions.csv +1 -0
  87. src/deployment_targets/agent_engine/tests/load_test/.results/results_failures.csv +1 -0
  88. src/deployment_targets/agent_engine/tests/load_test/.results/results_stats.csv +3 -0
  89. src/deployment_targets/agent_engine/tests/load_test/.results/results_stats_history.csv +22 -0
  90. src/deployment_targets/agent_engine/tests/load_test/README.md +42 -0
  91. src/deployment_targets/agent_engine/tests/load_test/load_test.py +100 -0
  92. src/deployment_targets/agent_engine/tests/unit/test_dummy.py +22 -0
  93. src/deployment_targets/cloud_run/Dockerfile +29 -0
  94. src/deployment_targets/cloud_run/app/server.py +128 -0
  95. src/deployment_targets/cloud_run/deployment/terraform/artifact_registry.tf +22 -0
  96. src/deployment_targets/cloud_run/deployment/terraform/dev/service_accounts.tf +20 -0
  97. src/deployment_targets/cloud_run/tests/integration/test_server_e2e.py +192 -0
  98. src/deployment_targets/cloud_run/tests/load_test/.results/.placeholder +0 -0
  99. src/deployment_targets/cloud_run/tests/load_test/README.md +79 -0
  100. src/deployment_targets/cloud_run/tests/load_test/load_test.py +85 -0
  101. src/deployment_targets/cloud_run/tests/unit/test_server.py +142 -0
  102. src/deployment_targets/cloud_run/uv.lock +6952 -0
  103. src/frontends/live_api_react/frontend/package-lock.json +19405 -0
  104. src/frontends/live_api_react/frontend/package.json +56 -0
  105. src/frontends/live_api_react/frontend/public/favicon.ico +0 -0
  106. src/frontends/live_api_react/frontend/public/index.html +62 -0
  107. src/frontends/live_api_react/frontend/public/robots.txt +3 -0
  108. src/frontends/live_api_react/frontend/src/App.scss +189 -0
  109. src/frontends/live_api_react/frontend/src/App.test.tsx +25 -0
  110. src/frontends/live_api_react/frontend/src/App.tsx +205 -0
  111. src/frontends/live_api_react/frontend/src/components/audio-pulse/AudioPulse.tsx +64 -0
  112. src/frontends/live_api_react/frontend/src/components/audio-pulse/audio-pulse.scss +68 -0
  113. src/frontends/live_api_react/frontend/src/components/control-tray/ControlTray.tsx +217 -0
  114. src/frontends/live_api_react/frontend/src/components/control-tray/control-tray.scss +201 -0
  115. src/frontends/live_api_react/frontend/src/components/logger/Logger.tsx +241 -0
  116. src/frontends/live_api_react/frontend/src/components/logger/logger.scss +133 -0
  117. src/frontends/live_api_react/frontend/src/components/logger/mock-logs.ts +151 -0
  118. src/frontends/live_api_react/frontend/src/components/side-panel/SidePanel.tsx +161 -0
  119. src/frontends/live_api_react/frontend/src/components/side-panel/side-panel.scss +285 -0
  120. src/frontends/live_api_react/frontend/src/contexts/LiveAPIContext.tsx +48 -0
  121. src/frontends/live_api_react/frontend/src/hooks/use-live-api.ts +115 -0
  122. src/frontends/live_api_react/frontend/src/hooks/use-media-stream-mux.ts +23 -0
  123. src/frontends/live_api_react/frontend/src/hooks/use-screen-capture.ts +72 -0
  124. src/frontends/live_api_react/frontend/src/hooks/use-webcam.ts +69 -0
  125. src/frontends/live_api_react/frontend/src/index.css +28 -0
  126. src/frontends/live_api_react/frontend/src/index.tsx +35 -0
  127. src/frontends/live_api_react/frontend/src/multimodal-live-types.ts +242 -0
  128. src/frontends/live_api_react/frontend/src/react-app-env.d.ts +17 -0
  129. src/frontends/live_api_react/frontend/src/reportWebVitals.ts +31 -0
  130. src/frontends/live_api_react/frontend/src/setupTests.ts +21 -0
  131. src/frontends/live_api_react/frontend/src/utils/audio-recorder.ts +111 -0
  132. src/frontends/live_api_react/frontend/src/utils/audio-streamer.ts +270 -0
  133. src/frontends/live_api_react/frontend/src/utils/audioworklet-registry.ts +43 -0
  134. src/frontends/live_api_react/frontend/src/utils/multimodal-live-client.ts +329 -0
  135. src/frontends/live_api_react/frontend/src/utils/store-logger.ts +64 -0
  136. src/frontends/live_api_react/frontend/src/utils/utils.ts +86 -0
  137. src/frontends/live_api_react/frontend/src/utils/worklets/audio-processing.ts +73 -0
  138. src/frontends/live_api_react/frontend/src/utils/worklets/vol-meter.ts +65 -0
  139. src/frontends/live_api_react/frontend/tsconfig.json +25 -0
  140. src/frontends/streamlit/frontend/side_bar.py +213 -0
  141. src/frontends/streamlit/frontend/streamlit_app.py +263 -0
  142. src/frontends/streamlit/frontend/style/app_markdown.py +37 -0
  143. src/frontends/streamlit/frontend/utils/chat_utils.py +67 -0
  144. src/frontends/streamlit/frontend/utils/local_chat_history.py +125 -0
  145. src/frontends/streamlit/frontend/utils/message_editing.py +59 -0
  146. src/frontends/streamlit/frontend/utils/multimodal_utils.py +217 -0
  147. src/frontends/streamlit/frontend/utils/stream_handler.py +282 -0
  148. src/frontends/streamlit/frontend/utils/title_summary.py +77 -0
  149. src/resources/containers/data_processing/Dockerfile +25 -0
  150. src/resources/locks/uv-agentic_rag_vertexai_search-agent_engine.lock +4684 -0
  151. src/resources/locks/uv-agentic_rag_vertexai_search-cloud_run.lock +5799 -0
  152. src/resources/locks/uv-crewai_coding_crew-agent_engine.lock +5509 -0
  153. src/resources/locks/uv-crewai_coding_crew-cloud_run.lock +6688 -0
  154. src/resources/locks/uv-langgraph_base_react-agent_engine.lock +4595 -0
  155. src/resources/locks/uv-langgraph_base_react-cloud_run.lock +5710 -0
  156. src/resources/locks/uv-multimodal_live_api-cloud_run.lock +5665 -0
  157. src/resources/setup_cicd/cicd_variables.tf +36 -0
  158. src/resources/setup_cicd/github.tf +85 -0
  159. src/resources/setup_cicd/providers.tf +39 -0
  160. src/utils/generate_locks.py +135 -0
  161. src/utils/lock_utils.py +82 -0
  162. src/utils/watch_and_rebuild.py +190 -0
@@ -0,0 +1,79 @@
1
+ # Data Ingestion Pipeline
2
+
3
+ This pipeline automates the ingestion of data into Vertex AI Search, streamlining the process of building Retrieval Augmented Generation (RAG) applications.
4
+
5
+ It orchestrates the complete workflow: loading data, chunking it into manageable segments, generating embeddings using Vertex AI Embeddings, and importing the processed data into your Vertex AI Search datastore.
6
+
7
+ You can trigger the pipeline for an initial data load or schedule it to run periodically, ensuring your search index remains current. Vertex AI Pipelines provides the orchestration and monitoring capabilities for this process.
8
+
9
+ ## Prerequisites
10
+
11
+ Before running the data ingestion pipeline, ensure you have completed the following:
12
+
13
+ 1. **Set up Dev Terraform:** Follow the instructions in the parent [deployment/README.md - Dev Deployment section](../deployment/README.md#dev-deployment) to provision the necessary resources in your development environment using Terraform. This includes deploying a datastore and configuring the required permissions.
14
+
15
+ ## Running the Data Ingestion Pipeline
16
+
17
+ After setting up the Terraform infrastructure, you can test the data ingestion pipeline.
18
+
19
+ > **Note:** The initial pipeline execution might take longer while your project is being configured for Vertex AI Pipelines.
20
+
21
+ **Steps:**
22
+
23
+ **a. Navigate to the `data_ingestion` directory:**
24
+
25
+ ```bash
26
+ cd data_ingestion
27
+ ```
28
+
29
+ **b. Install Dependencies:**
30
+
31
+ Install the required Python dependencies using uv:
32
+
33
+ ```bash
34
+ uv sync --frozen
35
+ ```
36
+
37
+ **c. Execute the Pipeline:**
38
+
39
+ Run the following command to execute the data ingestion pipeline. Replace the placeholder values with your actual project details.
40
+
41
+ ```bash
42
+ PROJECT_ID="YOUR_PROJECT_ID"
43
+ REGION="us-central1"
44
+ DATA_STORE_REGION="us"
45
+ uv run data_ingestion_pipeline/submit_pipeline.py \
46
+ --project-id=$PROJECT_ID \
47
+ --region=$REGION \
48
+ --data-store-region=$DATA_STORE_REGION \
49
+ --data-store-id="sample-datastore" \
50
+ --service-account="vertexai-pipelines-sa@$PROJECT_ID.iam.gserviceaccount.com" \
51
+ --pipeline-root="gs://$PROJECT_ID-pipeline-artifacts" \
52
+ --pipeline-name="data-ingestion-pipeline"
53
+ ```
54
+
55
+ **Parameter Explanation:**
56
+
57
+ * `--project-id`: Your Google Cloud project ID.
58
+ * `--region`: The region where Vertex AI Pipelines will run (e.g., `us-central1`).
59
+ * `--data-store-region`: The region for Vertex AI Search operations (e.g., `us` or `eu`).
60
+ * `--data-store-id`: The ID of your Vertex AI Search datastore.
61
+ * `--service-account`: The service account email used for pipeline execution. Ensure this service account has the necessary permissions (e.g., Vertex AI User, Storage Object Admin).
62
+ * `--pipeline-root`: The Google Cloud Storage (GCS) bucket for storing pipeline artifacts.
63
+ * `--pipeline-name`: A descriptive name for your pipeline.
64
+ * `--schedule-only` (Optional): If specified, the pipeline will only be scheduled and not executed immediately. Requires `--cron-schedule`.
65
+ * `--cron-schedule` (Optional): A cron expression defining the pipeline's schedule (e.g., `"0 9 * * 1"` for every Monday at 9:00 AM UTC).
66
+
67
+ **d. Pipeline Scheduling and Execution:**
68
+
69
+ By default, the pipeline executes immediately. To schedule it for periodic execution without triggering an immediate run, use the `--schedule-only` flag together with `--cron-schedule`. If no schedule exists, one is created; if one already exists, its cron expression is updated to the provided value.
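+
+ For example, to create or update a weekly schedule without running the pipeline immediately, you can reuse the variables from step c (a sketch; the cron value is illustrative):
+
+ ```bash
+ uv run data_ingestion_pipeline/submit_pipeline.py \
+ --project-id=$PROJECT_ID \
+ --region=$REGION \
+ --data-store-region=$DATA_STORE_REGION \
+ --data-store-id="sample-datastore" \
+ --service-account="vertexai-pipelines-sa@$PROJECT_ID.iam.gserviceaccount.com" \
+ --pipeline-root="gs://$PROJECT_ID-pipeline-artifacts" \
+ --pipeline-name="data-ingestion-pipeline" \
+ --schedule-only \
+ --cron-schedule="0 9 * * 1"
+ ```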
70
+
71
+ **e. Monitoring Pipeline Progress:**
72
+
73
+ The pipeline's configuration and execution status will be printed to the console. For detailed monitoring, use the Vertex AI Pipelines dashboard in the Google Cloud Console. This dashboard provides real-time insights into the pipeline's progress, logs, and any potential issues.
74
+
75
+ ## Testing Your RAG Application
76
+
77
+ Once the data ingestion pipeline completes successfully, you can test your RAG application with Vertex AI Search.
78
+
79
+ > **Troubleshooting:** If you encounter the error `"google.api_core.exceptions.InvalidArgument: 400 The embedding field path: embedding not found in schema"` after the initial data ingestion, wait a few minutes and try again. This delay allows Vertex AI Search to fully index the ingested data.
@@ -0,0 +1,175 @@
1
+ # Copyright 2025 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from kfp.dsl import Dataset, Input, component
16
+
17
+
18
+ @component(
19
+ base_image="us-docker.pkg.dev/production-ai-template/starter-pack/data_processing:0.1"
20
+ )
21
+ def ingest_data(
22
+ project_id: str,
23
+ data_store_region: str,
24
+ input_files: Input[Dataset],
25
+ data_store_id: str,
26
+ embedding_dimension: int = 768,
27
+ embedding_column: str = "embedding",
28
+ ) -> None:
29
+ """Process and ingest documents into Vertex AI Search datastore.
30
+
31
+ Args:
32
+ project_id: Google Cloud project ID
33
+ data_store_region: Region for Vertex AI Search
34
+ input_files: Input dataset containing documents
35
+ data_store_id: ID of target datastore
36
+ embedding_column: Name of embedding column in schema
37
+ """
38
+ import json
39
+ import logging
40
+
41
+ from google.api_core.client_options import ClientOptions
42
+ from google.cloud import discoveryengine
43
+
44
+ def update_schema_as_json(
45
+ original_schema: str,
46
+ embedding_dimension: int,
47
+ field_name: str | None = None,
48
+ ) -> str:
49
+ """Update datastore schema JSON to include embedding field.
50
+
51
+ Args:
52
+ original_schema: Original schema JSON string
53
+ field_name: Name of embedding field to add
54
+
55
+ Returns:
56
+ Updated schema JSON string
57
+ """
58
+ original_schema_dict = json.loads(original_schema)
59
+
60
+ if original_schema_dict.get("properties") is None:
61
+ original_schema_dict["properties"] = {}
62
+
63
+ if field_name:
64
+ field_schema = {
65
+ "type": "array",
66
+ "keyPropertyMapping": "embedding_vector",
67
+ "dimension": embedding_dimension,
68
+ "items": {"type": "number"},
69
+ }
70
+ original_schema_dict["properties"][field_name] = field_schema
71
+
72
+ return json.dumps(original_schema_dict)
73
+
74
+ def update_data_store_schema(
75
+ project_id: str,
76
+ location: str,
77
+ data_store_id: str,
78
+ field_name: str | None = None,
79
+ client_options: ClientOptions | None = None,
80
+ ) -> None:
81
+ """Update datastore schema to include embedding field.
82
+
83
+ Args:
84
+ project_id: Google Cloud project ID
85
+ location: Google Cloud location
86
+ data_store_id: Target datastore ID
87
+ embedding_column: Name of embedding column
88
+ client_options: Client options for API
89
+ """
90
+ schema_client = discoveryengine.SchemaServiceClient(
91
+ client_options=client_options
92
+ )
93
+ collection = "default_collection"
94
+
95
+ name = f"projects/{project_id}/locations/{location}/collections/{collection}/dataStores/{data_store_id}/schemas/default_schema"
96
+
97
+ schema = schema_client.get_schema(
98
+ request=discoveryengine.GetSchemaRequest(name=name)
99
+ )
100
+ new_schema_json = update_schema_as_json(
101
+ original_schema=schema.json_schema,
102
+ embedding_dimension=embedding_dimension,
103
+ field_name=field_name,
104
+ )
105
+ new_schema = discoveryengine.Schema(json_schema=new_schema_json, name=name)
106
+
107
+ operation = schema_client.update_schema(
108
+ request=discoveryengine.UpdateSchemaRequest(
109
+ schema=new_schema, allow_missing=True
110
+ )
111
+ )
112
+ logging.info(f"Waiting for schema update operation: {operation.operation.name}")
113
+ operation.result()
114
+
115
+ def add_data_in_store(
116
+ project_id: str,
117
+ location: str,
118
+ data_store_id: str,
119
+ input_files_uri: str,
120
+ client_options: ClientOptions | None = None,
121
+ ) -> None:
122
+ """Import documents into datastore.
123
+
124
+ Args:
125
+ project_id: Google Cloud project ID
126
+ location: Google Cloud location
127
+ data_store_id: Target datastore ID
128
+ input_files_uri: URI of input files
129
+ client_options: Client options for API
130
+ """
131
+ client = discoveryengine.DocumentServiceClient(client_options=client_options)
132
+
133
+ parent = client.branch_path(
134
+ project=project_id,
135
+ location=location,
136
+ data_store=data_store_id,
137
+ branch="default_branch",
138
+ )
139
+
140
+ request = discoveryengine.ImportDocumentsRequest(
141
+ parent=parent,
142
+ gcs_source=discoveryengine.GcsSource(
143
+ input_uris=[input_files_uri],
144
+ data_schema="document",
145
+ ),
146
+ reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.FULL,
147
+ )
148
+
149
+ operation = client.import_documents(request=request)
150
+ logging.info(f"Waiting for import operation: {operation.operation.name}")
151
+ operation.result()
152
+
153
+ client_options = ClientOptions(
154
+ api_endpoint=f"{data_store_region}-discoveryengine.googleapis.com"
155
+ )
156
+
157
+ logging.info("Updating data store schema...")
158
+ update_data_store_schema(
159
+ project_id=project_id,
160
+ location=data_store_region,
161
+ data_store_id=data_store_id,
162
+ field_name=embedding_column,
163
+ client_options=client_options,
164
+ )
165
+ logging.info("Schema updated successfully")
166
+
167
+ logging.info("Importing data into store...")
168
+ add_data_in_store(
169
+ project_id=project_id,
170
+ location=data_store_region,
171
+ data_store_id=data_store_id,
172
+ client_options=client_options,
173
+ input_files_uri=input_files.uri,
174
+ )
175
+ logging.info("Data import completed")
@@ -0,0 +1,321 @@
1
+ # Copyright 2025 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ This component is derived from the notebook:
17
+ https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented_generation/scalable_rag_with_bigframes.ipynb
18
+
19
+ It leverages BigQuery for data processing. We also suggest looking at remote functions for enhanced scalability.
20
+ """
21
+
22
+ from kfp.dsl import Dataset, Output, component
23
+
24
+
25
+ @component(
26
+ base_image="us-docker.pkg.dev/production-ai-template/starter-pack/data_processing:0.1"
27
+ )
28
+ def process_data(
29
+ project_id: str,
30
+ schedule_time: str,
31
+ output_files: Output[Dataset],
32
+ is_incremental: bool = True,
33
+ look_back_days: int = 1,
34
+ chunk_size: int = 1500,
35
+ chunk_overlap: int = 20,
36
+ destination_dataset: str = "stackoverflow_data",
37
+ destination_table: str = "incremental_questions_embeddings",
38
+ deduped_table: str = "questions_embeddings",
39
+ location: str = "us-central1",
40
+ embedding_column: str = "embedding",
41
+ ) -> None:
42
+ """Process StackOverflow questions and answers by:
43
+ 1. Fetching data from BigQuery
44
+ 2. Converting HTML to markdown
45
+ 3. Splitting text into chunks
46
+ 4. Generating embeddings
47
+ 5. Storing results in BigQuery
48
+ 6. Exporting to JSONL
49
+
50
+ Args:
51
+ project_id: Google Cloud project ID
+ schedule_time: Schedule time (ISO format) injected by Vertex AI Pipelines
+ output_files: Output dataset path
52
+ is_incremental: Whether to process only recent data
53
+ look_back_days: Number of days to look back for incremental processing
54
+ chunk_size: Size of text chunks
55
+ chunk_overlap: Overlap between chunks
56
+ destination_dataset: BigQuery dataset for storing results
57
+ destination_table: Table for storing incremental results
58
+ deduped_table: Table for storing deduplicated results
59
+ location: BigQuery location
60
+ """
61
+ import logging
62
+ from datetime import datetime, timedelta
63
+
64
+ import backoff
65
+ import bigframes.ml.llm as llm
66
+ import bigframes.pandas as bpd
67
+ import google.api_core.exceptions
68
+ import swifter
69
+ from google.cloud import bigquery
70
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
71
+ from markdownify import markdownify
72
+
73
+ # Initialize logging
74
+ logging.basicConfig(level=logging.INFO)
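+ # Importing swifter registers the `.swifter` accessor used for the accelerated
+ # apply calls below; logging it also keeps the import referenced.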
75
+ logging.info(f"Using {swifter} for apply operations.")
76
+
77
+ # Initialize clients
78
+ logging.info("Initializing clients...")
79
+ bq_client = bigquery.Client(project=project_id, location=location)
80
+ bpd.options.bigquery.project = project_id
81
+ bpd.options.bigquery.location = location
82
+ logging.info("Clients initialized.")
83
+
84
+ # Set date range for data fetch
85
+ schedule_time_dt: datetime = datetime.fromisoformat(
86
+ schedule_time.replace("Z", "+00:00")
87
+ )
88
+ if schedule_time_dt.year == 1970:
89
+ logging.warning(
90
+ "Pipeline schedule not set. Setting schedule_time to current date."
91
+ )
92
+ schedule_time_dt = datetime.now()
93
+
94
+ # Note: The following line sets the schedule time 5 years back to allow sample data to be present.
95
+ # For your use case, please comment out the following line to use the actual schedule time.
96
+ schedule_time_dt = schedule_time_dt - timedelta(days=5 * 365)
97
+
98
+ START_DATE: datetime = schedule_time_dt - timedelta(
99
+ days=look_back_days
100
+ ) # Start date for data processing window
101
+ END_DATE: datetime = schedule_time_dt # End date for data processing window
102
+
103
+ logging.info(f"Date range set: START_DATE={START_DATE}, END_DATE={END_DATE}")
104
+
105
+ def fetch_stackoverflow_data(
106
+ dataset_suffix: str, start_date: str, end_date: str
107
+ ) -> bpd.DataFrame:
108
+ """Fetch StackOverflow data from BigQuery."""
109
+ query = f"""
110
+ SELECT
111
+ creation_date,
112
+ last_edit_date,
113
+ question_id,
114
+ question_title,
115
+ question_body AS question_text,
116
+ answers
117
+ FROM `production-ai-template.stackoverflow_qa_{dataset_suffix}.stackoverflow_python_questions_and_answers`
118
+ WHERE TRUE
119
+ {f'AND TIMESTAMP_TRUNC(creation_date, DAY) BETWEEN TIMESTAMP("{start_date}") AND TIMESTAMP("{end_date}")' if is_incremental else ""}
120
+ """
121
+ logging.info("Fetching StackOverflow data from BigQuery...")
122
+ return bpd.read_gbq(query)
123
+
124
+ def convert_html_to_markdown(html: str) -> str:
125
+ """Convert HTML into Markdown for easier parsing and rendering after LLM response."""
126
+ return markdownify(html).strip()
127
+
128
+ def create_answers_markdown(answers: list) -> str:
129
+ """Convert each answer's HTML to markdown and concatenate into a single markdown text."""
130
+ answers_md = ""
131
+ for index, answer_record in enumerate(answers):
132
+ answers_md += (
133
+ f"\n\n## Answer {index + 1}:\n" # Answer number is H2 heading size
134
+ )
135
+ answers_md += convert_html_to_markdown(answer_record["body"])
136
+ return answers_md
137
+
138
+ def create_table_if_not_exist(
139
+ df: bpd.DataFrame,
140
+ project_id: str,
141
+ dataset_id: str,
142
+ table_id: str,
143
+ partition_column: str,
144
+ location: str = location,
145
+ ) -> None:
146
+ """Create BigQuery table with time partitioning if it doesn't exist."""
147
+ table_schema = bq_client.get_table(df.head(0).to_gbq()).schema
148
+ table = bigquery.Table(
149
+ f"{project_id}.{dataset_id}.{table_id}", schema=table_schema
150
+ )
151
+ table.time_partitioning = bigquery.TimePartitioning(
152
+ type_=bigquery.TimePartitioningType.DAY, field=partition_column
153
+ )
154
+
155
+ dataset = bigquery.Dataset(f"{project_id}.{dataset_id}")
156
+ dataset.location = location
157
+ bq_client.create_dataset(dataset, exists_ok=True)
158
+ bq_client.create_table(table=table, exists_ok=True)
159
+
160
+ # Fetch and preprocess data
161
+ logging.info("Fetching and preprocessing data...")
162
+ df = fetch_stackoverflow_data(
163
+ start_date=START_DATE.strftime("%Y-%m-%d"),
164
+ end_date=END_DATE.strftime("%Y-%m-%d"),
165
+ dataset_suffix=location.lower().replace("-", "_"),
166
+ )
167
+ df = (
168
+ df.sort_values("last_edit_date", ascending=False)
169
+ .drop_duplicates("question_id")
170
+ .reset_index(drop=True)
171
+ )
172
+ logging.info("Data fetched and preprocessed.")
173
+
174
+ # Convert content to markdown
175
+ logging.info("Converting content to markdown...")
176
+
177
+ # Create markdown fields efficiently
178
+ df["question_title_md"] = (
179
+ "# " + df["question_title"] + "\n"
180
+ ) # Title is H1 heading size
181
+ df["question_text_md"] = (
182
+ df["question_text"].to_pandas().swifter.apply(convert_html_to_markdown) + "\n"
183
+ )
184
+ df["answers_md"] = df["answers"].to_pandas().swifter.apply(create_answers_markdown)
185
+
186
+ # Create a column containing the whole markdown text
187
+ df["full_text_md"] = (
188
+ df["question_title_md"] + df["question_text_md"] + df["answers_md"]
189
+ )
190
+ logging.info("Content converted to markdown.")
191
+
192
+ # Keep only necessary columns
193
+ df = df[["last_edit_date", "question_id", "question_text", "full_text_md"]]
194
+
195
+ # Split text into chunks
196
+ logging.info("Splitting text into chunks...")
197
+ text_splitter = RecursiveCharacterTextSplitter(
198
+ chunk_size=chunk_size,
199
+ chunk_overlap=chunk_overlap,
200
+ length_function=len,
201
+ )
202
+
203
+ df["text_chunk"] = (
204
+ df["full_text_md"]
205
+ .to_pandas()
206
+ .astype(object)
207
+ .swifter.apply(text_splitter.split_text)
208
+ )
209
+ logging.info("Text split into chunks.")
210
+
211
+ # Create chunk IDs and explode chunks into rows
212
+ logging.info("Creating chunk IDs and exploding chunks into rows...")
213
+ chunk_ids = [
214
+ str(idx) for text_chunk in df["text_chunk"] for idx in range(len(text_chunk))
215
+ ]
216
+ df = df.explode("text_chunk").reset_index(drop=True)
217
+ df["chunk_id"] = df["question_id"].astype("string") + "__" + chunk_ids
218
+ logging.info("Chunk IDs created and chunks exploded.")
219
+
220
+ # Generate embeddings
221
+ logging.info("Generating embeddings...")
222
+
223
+ # The first invocation in a new project might fail due to permission propagation.
224
+ @backoff.on_exception(
225
+ backoff.expo, google.api_core.exceptions.InvalidArgument, max_tries=10
226
+ )
227
+ def create_embedder() -> llm.TextEmbeddingGenerator:
228
+ return llm.TextEmbeddingGenerator(model_name="text-embedding-005")
229
+
230
+ embedder = create_embedder()
231
+
232
+ embeddings_df = embedder.predict(df["text_chunk"])
233
+ logging.info("Embeddings generated.")
234
+
235
+ df = df.assign(
236
+ embedding=embeddings_df["ml_generate_embedding_result"],
237
+ embedding_statistics=embeddings_df["ml_generate_embedding_statistics"],
238
+ embedding_status=embeddings_df["ml_generate_embedding_status"],
239
+ creation_timestamp=datetime.now(),
240
+ )
241
+
242
+ # Store results in BigQuery
243
+ PARTITION_DATE_COLUMN = "creation_timestamp"
244
+
245
+ # Create and populate incremental table
246
+ logging.info("Creating and populating incremental table...")
247
+ create_table_if_not_exist(
248
+ df=df,
249
+ project_id=project_id,
250
+ dataset_id=destination_dataset,
251
+ table_id=destination_table,
252
+ partition_column=PARTITION_DATE_COLUMN,
253
+ )
254
+
255
+ if_exists_mode = "append" if is_incremental else "replace"
256
+ df.to_gbq(
257
+ destination_table=f"{destination_dataset}.{destination_table}",
258
+ if_exists=if_exists_mode,
259
+ )
260
+ logging.info("Incremental table created and populated.")
261
+
262
+ # Create deduplicated table
263
+ logging.info("Creating deduplicated table...")
264
+ df_questions = bpd.read_gbq(
265
+ f"{destination_dataset}.{destination_table}", use_cache=False
266
+ )
267
+ max_date_df = (
268
+ df_questions.groupby("question_id")["creation_timestamp"].max().reset_index()
269
+ )
270
+ df_questions_dedup = max_date_df.merge(
271
+ df_questions, how="inner", on=["question_id", "creation_timestamp"]
272
+ )
273
+
274
+ create_table_if_not_exist(
275
+ df=df_questions_dedup,
276
+ project_id=project_id,
277
+ dataset_id=destination_dataset,
278
+ table_id=deduped_table,
279
+ partition_column=PARTITION_DATE_COLUMN,
280
+ )
281
+
282
+ df_questions_dedup.to_gbq(
283
+ destination_table=f"{destination_dataset}.{deduped_table}",
284
+ if_exists="replace",
285
+ )
286
+ logging.info("Deduplicated table created and populated.")
287
+
288
+ # Export to JSONL
289
+ logging.info("Exporting to JSONL...")
290
+
291
+ export_query = f"""
292
+ SELECT
293
+ chunk_id as id,
294
+ TO_JSON_STRING(STRUCT(
295
+ chunk_id as id,
296
+ embedding as {embedding_column},
297
+ text_chunk as content,
298
+ question_id,
299
+ CAST(creation_timestamp AS STRING) as creation_timestamp,
300
+ CAST(last_edit_date AS STRING) as last_edit_date,
301
+ question_text,
302
+ full_text_md
303
+ )) as json_data
304
+ FROM
305
+ `{project_id}.{destination_dataset}.{deduped_table}`
306
+ WHERE
307
+ chunk_id IS NOT NULL
308
+ AND embedding IS NOT NULL
309
+ """
310
+ export_df_id = bpd.read_gbq(export_query).to_gbq()
311
+
312
+ output_files.uri = output_files.uri + "*.jsonl"
313
+
314
+ job_config = bigquery.ExtractJobConfig()
315
+ job_config.destination_format = bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON
316
+
317
+ extract_job = bq_client.extract_table(
318
+ export_df_id, output_files.uri, job_config=job_config
319
+ )
320
+ extract_job.result()
321
+ logging.info("Exported to JSONL.")
@@ -0,0 +1,58 @@
1
+ # Copyright 2025 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from data_ingestion_pipeline.components.ingest_data import ingest_data
16
+ from data_ingestion_pipeline.components.process_data import process_data
17
+ from kfp import dsl
18
+
19
+
20
+ @dsl.pipeline(description="A pipeline to run ingestion of new data into the datastore")
21
+ def pipeline(
22
+ project_id: str,
23
+ location: str,
24
+ data_store_region: str,
25
+ data_store_id: str,
26
+ is_incremental: bool = True,
27
+ look_back_days: int = 1,
28
+ chunk_size: int = 1500,
29
+ chunk_overlap: int = 20,
30
+ destination_dataset: str = "stackoverflow_data",
31
+ destination_table: str = "incremental_questions_embeddings",
32
+ deduped_table: str = "questions_embeddings",
33
+ ) -> None:
34
+ """Processes data and ingests it into a datastore for RAG Retrieval"""
35
+
36
+ # Process the data and generate embeddings
37
+ processed_data = process_data(
38
+ project_id=project_id,
39
+ schedule_time=dsl.PIPELINE_JOB_SCHEDULE_TIME_UTC_PLACEHOLDER,
40
+ is_incremental=is_incremental,
41
+ look_back_days=look_back_days,
42
+ chunk_size=chunk_size,
43
+ chunk_overlap=chunk_overlap,
44
+ destination_dataset=destination_dataset,
45
+ destination_table=destination_table,
46
+ deduped_table=deduped_table,
47
+ location=location,
48
+ embedding_column="embedding",
49
+ )
50
+
51
+ # Ingest the processed data into Vertex AI Search datastore
52
+ ingest_data(
53
+ project_id=project_id,
54
+ data_store_region=data_store_region,
55
+ input_files=processed_data.output,
56
+ data_store_id=data_store_id,
57
+ embedding_column="embedding",
58
+ )