@san-francisco/sf-docs-embeddings 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ The San Francisco License (SF License)
2
+
3
+ Copyright (c) San Francisco License Contributors 2026
4
+
5
+ Permission is granted, free of charge, to any person or organization
6
+ obtaining a copy of this software and associated documentation files
7
+ (the "Software"), to use, copy, modify, merge, publish, distribute,
8
+ sublicense, and/or sell copies of the Software, and to permit others
9
+ to whom the Software is furnished to do the same.
10
+
11
+ The only requirement is that this license notice and copyright notice
12
+ shall be included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT.
17
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18
+ CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19
+ TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE
20
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
package/Makefile ADDED
@@ -0,0 +1,6 @@
1
+ EXTENSION = sf-docs-embeddings
2
+ DATA = sql/sf-docs-embeddings--0.0.1.sql
3
+
4
+ PG_CONFIG = pg_config
5
+ PGXS := $(shell $(PG_CONFIG) --pgxs)
6
+ include $(PGXS)
package/README.md ADDED
@@ -0,0 +1,59 @@
1
+ # @san-francisco/sf-docs-embeddings
2
+
3
+ San Francisco documentation embeddings - example RAG data package.
4
+
5
+ ## Overview
6
+
7
+ This package demonstrates how to ship pre-computed embeddings as PGPM migrations. It creates a sample collection with San Francisco city documentation and example embeddings.
8
+
9
+ ## What This Package Does
10
+
11
+ 1. Creates an embedding model configuration for `text-embedding-3-small`
12
+ 2. Creates the `sf-docs` collection with semantic chunking config
13
+ 3. Seeds example documents and chunks
14
+ 4. Does not ship real vector embeddings (a production package would include them)
15
+
16
+ ## Usage
17
+
18
+ ```bash
19
+ # Install the RAG schema first
20
+ pgpm deploy @sf-bot/rag-core
21
+
22
+ # Then install this data package
23
+ pgpm deploy @san-francisco/sf-docs-embeddings
24
+ ```
25
+
26
+ ## Data Structure
27
+
28
+ After installation, you'll have:
29
+
30
+ - **Collection**: `sf-docs` - San Francisco city documentation
31
+ - **Model**: `text-embedding-3-small` (OpenAI, 1536 dimensions)
32
+ - **Documents**: Example SF city services content
33
+ - **Chunks**: Semantically chunked document segments
34
+
35
+ ## Creating Your Own Data Package
36
+
37
+ To create a similar data package for your own embeddings:
38
+
39
+ 1. Generate embeddings using your preferred model
40
+ 2. Export using `rag.export_collection_json()`
41
+ 3. Convert the JSON to SQL INSERT statements
42
+ 4. Package as a PGPM module
43
+
44
+ Example workflow:
45
+ ```sql
46
+ -- Export your collection
47
+ SELECT rag.export_collection_json('your-collection-id');
48
+
49
+ -- Or export as CSV for processing
50
+ SELECT * FROM rag.export_embeddings_csv('your-collection-id');
51
+ ```
52
+
53
+ ## Dependencies
54
+
55
+ - `@sf-bot/rag-core`
56
+
57
+ ## License
58
+
59
+ [SF License](https://github.com/city-of-san-francisco/license)
@@ -0,0 +1,93 @@
1
+ -- Deploy data/seed_collection to pg
2
+ -- made with <3 @ constructive.io
3
+
4
+ -- requires: @sf-bot/rag-core
5
+
6
+ -- This is an example data package showing how to ship embeddings as migrations.
7
+ -- In production, this would contain actual embeddings generated from SF documentation.
8
+
9
+ BEGIN;
10
+
11
+ -- Register the text-embedding-3-small model (provider openai, 1536 dimensions); a row that already has this name is left untouched
12
+ INSERT INTO rag.embedding_model (name, provider, dimensions, metadata)
13
+ VALUES (
14
+ 'text-embedding-3-small',
15
+ 'openai',
16
+ 1536,
17
+ '{"description": "OpenAI text-embedding-3-small model"}'::jsonb
18
+ )
19
+ ON CONFLICT (name) DO NOTHING;
20
+
21
+ -- Create the SF Docs collection; if a collection named 'sf-docs' already exists it is kept as-is (chunk_config and metadata are not updated)
22
+ INSERT INTO rag.collection (name, description, chunk_config, metadata)
23
+ VALUES (
24
+ 'sf-docs',
25
+ 'San Francisco city documentation and public information',
26
+ '{"strategy": "semantic", "max_tokens": 512, "overlap_tokens": 50}'::jsonb,
27
+ '{"source": "sf.gov", "version": "2026.01"}'::jsonb
28
+ )
29
+ ON CONFLICT (name) DO NOTHING;
30
+
31
+ -- Link collection to model: explicit CROSS JOIN of the two name-filtered rows; an already-linked pair is skipped
32
+ INSERT INTO rag.collection_model (collection_id, model_id, is_default)
33
+ SELECT c.id, m.id, true
34
+ FROM rag.collection c CROSS JOIN rag.embedding_model m
35
+ WHERE c.name = 'sf-docs' AND m.name = 'text-embedding-3-small'
36
+ ON CONFLICT (collection_id, model_id) DO NOTHING;
37
+
38
+ -- Example document: SF City Services Overview, attached to the 'sf-docs' collection looked up by name
39
+ -- In production, this would be populated with actual content and embeddings
40
+ INSERT INTO rag.document (collection_id, source_uri, source_type, content, content_hash, metadata, status)
41
+ SELECT
42
+ c.id,
43
+ 'https://sf.gov/services',
44
+ 'url',
45
+ 'San Francisco provides a wide range of city services to residents and visitors. These include public transportation through MUNI and BART, parks and recreation facilities, public libraries, and emergency services. The city also offers various permits and licenses for businesses and residents.',
46
+ 'sha256:example_sf_services_001', -- placeholder identifier, not an actual SHA-256 digest of the content
47
+ '{"title": "SF City Services Overview", "category": "services"}'::jsonb,
48
+ 'embedded' -- NOTE(review): this script inserts no embedding rows; confirm 'embedded' is the intended status
49
+ FROM rag.collection c
50
+ WHERE c.name = 'sf-docs'
51
+ ON CONFLICT (collection_id, content_hash) DO NOTHING;
52
+
53
+ -- Example chunks for the document (in production, these would be generated by a chunking algorithm)
54
+ -- Offsets assume character positions into the document content with an exclusive end: chunk 0 = [0, 213), chunk 1 = [214, 293); the space at 213 is in neither chunk -- TODO confirm convention against rag-core
55
+ WITH doc AS (
56
+ SELECT d.id FROM rag.document d
57
+ INNER JOIN rag.collection c ON c.id = d.collection_id
58
+ WHERE c.name = 'sf-docs' AND d.content_hash = 'sha256:example_sf_services_001'
59
+ )
60
+ INSERT INTO rag.chunk (document_id, content, chunk_index, start_offset, end_offset, token_count, metadata)
61
+ SELECT
62
+ doc.id,
63
+ 'San Francisco provides a wide range of city services to residents and visitors. These include public transportation through MUNI and BART, parks and recreation facilities, public libraries, and emergency services.',
64
+ 0,
65
+ 0,
66
+ 213, -- chunk text is 213 characters long
67
+ 42,
68
+ '{"section": "introduction"}'::jsonb
69
+ FROM doc
70
+ ON CONFLICT (document_id, chunk_index) DO NOTHING;
71
+
72
+ WITH doc AS (
73
+ SELECT d.id FROM rag.document d
74
+ INNER JOIN rag.collection c ON c.id = d.collection_id
75
+ WHERE c.name = 'sf-docs' AND d.content_hash = 'sha256:example_sf_services_001'
76
+ )
77
+ INSERT INTO rag.chunk (document_id, content, chunk_index, start_offset, end_offset, token_count, metadata)
78
+ SELECT
79
+ doc.id,
80
+ 'The city also offers various permits and licenses for businesses and residents.',
81
+ 1,
82
+ 214, -- first character after the separating space
83
+ 293, -- 214 + 79 characters = total length of the document content
84
+ 14,
85
+ '{"section": "permits"}'::jsonb
86
+ FROM doc
87
+ ON CONFLICT (document_id, chunk_index) DO NOTHING;
88
+
89
+ -- Example embeddings: none are inserted by this script (in production these would be real embeddings)
90
+ -- No placeholder vectors are included here; real embeddings would be 1536-dimensional
91
+ -- Note: The actual embedding insertion would use the full 1536-dim vectors
92
+
93
+ COMMIT;
package/package.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "name": "@san-francisco/sf-docs-embeddings",
3
+ "version": "0.0.3",
4
+ "author": "Dan Lynch <pyramation@gmail.com>",
5
+ "description": "San Francisco documentation embeddings - example RAG data package",
6
+ "homepage": "https://github.com/constructive-io/sf-bot",
7
+ "license": "MIT",
8
+ "publishConfig": {
9
+ "access": "public"
10
+ },
11
+ "repository": {
12
+ "type": "git",
13
+ "url": "https://github.com/constructive-io/sf-bot"
14
+ },
15
+ "bugs": {
16
+ "url": "https://github.com/constructive-io/sf-bot/issues"
17
+ },
18
+ "files": [
19
+ "deploy",
20
+ "verify",
21
+ "revert",
22
+ "pgpm.plan",
23
+ "*.control",
24
+ "Makefile"
25
+ ],
26
+ "scripts": {
27
+ "build": "true",
28
+ "clean": "true",
29
+ "lint": "eslint . --fix",
30
+ "test": "jest",
31
+ "test:watch": "jest --watchAll"
32
+ },
33
+ "keywords": [
34
+ "san-francisco",
35
+ "embeddings",
36
+ "rag",
37
+ "data"
38
+ ],
39
+ "devDependencies": {
40
+ "makage": "0.1.9",
41
+ "pgsql-test": "^2.18.15"
42
+ },
43
+ "gitHead": "4f7e6a0f13025df722f24602bfb1d173d04cf95c"
44
+ }
package/pgpm.plan ADDED
@@ -0,0 +1,5 @@
1
+ %syntax-version=1.0.0
2
+ %project=sf-docs-embeddings
3
+ %uri=sf-docs-embeddings
4
+
5
+ data/seed_collection 2026-01-25T01:10:00Z constructive <constructive@sf-bot> # seed SF docs collection with example data
@@ -0,0 +1,8 @@
1
+ -- Revert data/seed_collection from pg
2
+
3
+ BEGIN;
4
+
5
+ -- Remove the SF Docs collection. NOTE(review): documents, chunks and embeddings are removed only if the rag-core foreign keys cascade on delete -- confirm. This statement does not touch rag.embedding_model.
6
+ DELETE FROM rag.collection WHERE name = 'sf-docs';
7
+
8
+ COMMIT;
@@ -0,0 +1,7 @@
1
+ # sf-docs-embeddings extension
2
+ comment = 'San Francisco documentation embeddings - example RAG data package'
3
+ default_version = '0.0.1'
4
+ module_pathname = '$libdir/sf-docs-embeddings'
5
+ requires = 'rag-core'
6
+ relocatable = false
7
+ superuser = false
@@ -0,0 +1,7 @@
1
+ -- Verify data/seed_collection on pg
2
+
3
+ BEGIN;
4
+ -- count(*) is 0 when no 'sf-docs' collection exists, so 1/count(*) raises a division-by-zero error and the verify step fails
5
+ SELECT 1/count(*) FROM rag.collection WHERE name = 'sf-docs';
6
+
7
+ ROLLBACK;