@yottagraph-app/data-model-skill 0.0.30 → 0.0.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yottagraph-app/data-model-skill",
3
- "version": "0.0.30",
3
+ "version": "0.0.32",
4
4
  "description": "Data model skill documentation for AI agents - entity types, properties, and schemas from Lovelace fetch sources",
5
5
  "repository": {
6
6
  "type": "git",
@@ -65,7 +65,7 @@ properties:
65
65
 
66
66
  - name: "serial_number"
67
67
  type: string
68
- description: "Complete aircraft serial number assigned by the manufacturer"
68
+ description: "Manufacturer serial number of the aircraft"
69
69
  display_name: "Serial Number"
70
70
  mergeability: not_mergeable
71
71
  domain_flavors: ["aircraft"]
@@ -376,6 +376,15 @@ relationships:
376
376
  examples: ["DELTA AIR LINES INC owns aircraft N12345"]
377
377
  passive: true
378
378
 
379
+ - name: "manufactures"
380
+ description: "An organization manufactures or has produced an aircraft"
381
+ display_name: "Manufactures"
382
+ mergeability: not_mergeable
383
+ domain_flavors: ["organization"]
384
+ target_flavors: ["aircraft"]
385
+ examples: ["CESSNA manufactures CESSNA 172S (S/N 28-7990244)"]
386
+ passive: true
387
+
379
388
  - name: "is_located_at"
380
389
  description: "An entity is located at, operates in, resides in, is headquartered in, was born in, visits, or died in a location"
381
390
  display_name: "Located At"
@@ -0,0 +1,116 @@
1
+ # Data Dictionary: Patent Parents
2
+
3
+ ## Source Overview
4
+
5
+ The Patent Parents source aggregates US patent grant counts per assignee from the Google Patents Public Datasets BigQuery table (`patents-public-data.patents.publications`), identifies new assignees via diffing, and uses Vertex AI (Gemini) to assign corporate parent entities to each new patent-holding company.
6
+
7
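+ The pipeline produces `subsidiary_of` relationship records linking child organizations (patent assignees) to their parent companies. No standalone organization records are emitted — only relationship records, which carry the child organization's patent-count and grant-date properties when assignee data is available.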
8
+
9
+ | Stage | Description |
10
+ |-------|-------------|
11
+ | BigQuery aggregation | Counts US patents per assignee in a configurable `grant_date` window |
12
+ | Diffing | SHA-256 diff against download store; identifies new (unseen) assignees |
13
+ | LLM parent assignment | Vertex AI maps each new assignee to its corporate parent in batches of 100 |
14
+ | Atomization | Emits `subsidiary_of` relationships; skips self-referential (child == parent) rows |
15
+
16
+ | Pipeline | `Record.Source` |
17
+ |----------|----------------|
18
+ | All records | `patentparents` |
19
+
20
+ ---
21
+
22
+ ## Data Source
23
+
24
+ - **Table**: `patents-public-data.patents.publications` (Google BigQuery public dataset)
25
+ - **Fields used**: `assignee` (REPEATED STRING), `publication_number`, `country_code`, `grant_date`
26
+ - **Aggregation**: `COUNT(publication_number)` grouped by `UNNEST(assignee)`, filtered to `country_code = 'US'`
27
+ - **Assignee metadata available**: name and country_code only (no addresses, no corporate IDs)
28
+
29
+ ---
30
+
31
+ ## Entity Types
32
+
33
+ ### `organization`
34
+
35
+ A company, institution, or entity that holds US patent grants as an assignee.
36
+
37
+ - Entity resolver: named entity, MERGEABLE. No strong IDs (assignee names are not globally unique identifiers).
38
+ - Appears as both subject (child company) and target (parent company) in `patentparents::subsidiary_of` relationships.
39
+ - Resolver snippet includes patent count and grant date range (e.g., "Nokia Technologies Oy — 1095 US patents granted between 01/01/2025 and 04/01/2025").
40
+
41
+ ---
42
+
43
+ ## Relationships
44
+
45
+ ### `patentparents::subsidiary_of`
46
+
47
+ Links a patent-holding organization (child/subject) to its corporate parent (target).
48
+
49
+ - **Domain flavor**: `organization` (the patent assignee)
50
+ - **Target flavor**: `organization` (the parent company)
51
+ - **Derivation**: Vertex AI LLM analysis of patent assignee names. The LLM is prompted to assign a parent entity to each company; it may return the same name (self-referential), in which case the row is filtered out.
52
+ - **Citation**: `"{child} is a subsidiary of {parent}"`
53
+ - **Mergeability**: not_mergeable
54
+
55
+ ---
56
+
57
+ ## Properties
58
+
59
+ ### `patentparents::total_patents`
60
+
61
+ - **Type**: float
62
+ - **Definition**: Total number of US patent grants assigned to this organization in the scanned `grant_date` window.
63
+ - **Derivation**: `COUNT(publication_number)` from the BigQuery aggregation query.
64
+ - **Emitted on**: the child (assignee) organization in each `patentparents::subsidiary_of` record, when assignee data is available.
65
+
66
+ ### `patentparents::grant_date_from`
67
+
68
+ - **Type**: string (YYYY-MM-DD)
69
+ - **Definition**: Start of the `grant_date` window used in the BigQuery aggregation.
70
+ - **Emitted on**: the child (assignee) organization in each `patentparents::subsidiary_of` record, when assignee data is available.
71
+
72
+ ### `patentparents::grant_date_to`
73
+
74
+ - **Type**: string (YYYY-MM-DD)
75
+ - **Definition**: End of the `grant_date` window used in the BigQuery aggregation.
76
+ - **Emitted on**: the child (assignee) organization in each `patentparents::subsidiary_of` record, when assignee data is available.
77
+
78
+ ---
79
+
80
+ ## Configuration
81
+
82
+ | Arg | Type | Default | Description |
83
+ |-----|------|---------|-------------|
84
+ | `projectId` | string | (required) | GCP project for BigQuery and Vertex AI |
85
+ | `initialGrantDateMin` | int | (required) | YYYYMMDD lower bound for the grant_date window |
86
+ | `maxRows` | int | 0 | Max assignee rows from BigQuery (0 = unlimited); top N by patent count |
87
+ | `pollTimeMin` | int | 1440 | Poll interval in minutes |
88
+ | `batchSize` | int | 100 | Records per published FetchMessage |
89
+ | `llmModel` | string | `gemini-2.5-flash` | Vertex AI model for parent assignment |
90
+ | `vertexLocation` | string | `us-central1` | Vertex AI region |
91
+
92
+ ---
93
+
94
+ ## Pipeline Flow
95
+
96
+ ```
97
+ BigQuery (patents-public-data.patents.publications)
98
+ │ SELECT assignee, COUNT(publication_number) ... GROUP BY assignee
99
+
100
+ DiffingStreamer
101
+ │ WriteIfChanged → download/{sanitized_name}.json
102
+ │ scanKnownKeys → detect new vs existing assignees
103
+
104
+ New Companies List
105
+ │ Companies not previously in the download store
106
+
107
+ Vertex AI (batches of 100)
108
+ │ Prompt: "assign parent entities to these companies"
109
+ │ Response: JSON array of {company, parent}
110
+
111
+ AtomizeParentAssignment
112
+ │ Skip self-referential (child == parent)
113
+ │ Emit: child org → subsidiary_of → parent org
114
+
115
+ Published FetchMessage (.binpb.zst)
116
+ ```
@@ -0,0 +1,58 @@
1
+ # Dataset schema for patent-parents: aggregated US patent counts per assignee
2
+ # from Google Patents Public Datasets.
3
+ #
4
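+ # Structured atomization only — no LLM extraction in the atomizer. Note: the parent assignments themselves are LLM-generated (Vertex AI) earlier in the pipeline.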
5
+ name: "patentparents"
6
+ description: "Aggregated US patent grant counts per assignee from Google Patents Public Datasets (patents-public-data.patents.publications)."
7
+
8
+ extraction:
9
+ flavors: closed
10
+ properties: closed
11
+ relationships: closed
12
+ attributes: closed
13
+ events: closed
14
+
15
+ flavors:
16
+ - name: "organization"
17
+ description: "A particular business, institution, or organization such as a corporation, university, government agency, or non-profit"
18
+ display_name: "Organization"
19
+ mergeability: not_mergeable
20
+ passive: true
21
+
22
+ properties:
23
+ - name: "total_patents"
24
+ namespace: "patentparents"
25
+ type: float
26
+ description: "Total number of US patent grants assigned to this organization in the scanned grant_date window"
27
+ display_name: "Total Patents"
28
+ mergeability: not_mergeable
29
+ domain_flavors: ["organization"]
30
+ passive: true
31
+
32
+ - name: "grant_date_from"
33
+ namespace: "patentparents"
34
+ type: string
35
+ description: "Start of the grant_date window"
36
+ display_name: "Grant Date From"
37
+ mergeability: not_mergeable
38
+ domain_flavors: ["organization"]
39
+ passive: true
40
+
41
+ - name: "grant_date_to"
42
+ namespace: "patentparents"
43
+ type: string
44
+ description: "End of the grant_date window"
45
+ display_name: "Grant Date To"
46
+ mergeability: not_mergeable
47
+ domain_flavors: ["organization"]
48
+ passive: true
49
+
50
+ relationships:
51
+ - name: "subsidiary_of"
52
+ namespace: "patentparents"
53
+ description: "An organization is a subsidiary, affiliate, or controlled entity of another organization (e.g. a special purpose partnership controlled by a developer)."
54
+ display_name: "Subsidiary Of"
55
+ mergeability: not_mergeable
56
+ domain_flavors: ["organization"]
57
+ target_flavors: ["organization"]
58
+ passive: true