onesecondtrader 0.51.0__tar.gz → 0.53.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/PKG-INFO +1 -1
  2. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/pyproject.toml +2 -1
  3. onesecondtrader-0.53.0/src/onesecondtrader/secmaster/__init__.py +15 -0
  4. onesecondtrader-0.53.0/src/onesecondtrader/secmaster/schema_versions/__init__.py +0 -0
  5. onesecondtrader-0.53.0/src/onesecondtrader/secmaster/schema_versions/secmaster_schema_v1.sql +197 -0
  6. onesecondtrader-0.53.0/src/onesecondtrader/secmaster/utils.py +578 -0
  7. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/LICENSE +0 -0
  8. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/README.md +0 -0
  9. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/__init__.py +0 -0
  10. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/brokers/__init__.py +0 -0
  11. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/brokers/base.py +0 -0
  12. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/brokers/simulated.py +0 -0
  13. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/__init__.py +0 -0
  14. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/base.py +0 -0
  15. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/market/__init__.py +0 -0
  16. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/market/bar_processed.py +0 -0
  17. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/market/bar_received.py +0 -0
  18. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/orders/__init__.py +0 -0
  19. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/orders/base.py +0 -0
  20. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/orders/expirations.py +0 -0
  21. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/orders/fills.py +0 -0
  22. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/requests/__init__.py +0 -0
  23. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/requests/base.py +0 -0
  24. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/requests/order_cancellation.py +0 -0
  25. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/requests/order_modification.py +0 -0
  26. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/requests/order_submission.py +0 -0
  27. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/responses/__init__.py +0 -0
  28. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/responses/base.py +0 -0
  29. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/responses/cancellations.py +0 -0
  30. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/responses/modifications.py +0 -0
  31. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/events/responses/orders.py +0 -0
  32. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/indicators/__init__.py +0 -0
  33. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/indicators/base.py +0 -0
  34. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/indicators/market_fields.py +0 -0
  35. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/indicators/moving_averages.py +0 -0
  36. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/messaging/__init__.py +0 -0
  37. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/messaging/eventbus.py +0 -0
  38. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/messaging/subscriber.py +0 -0
  39. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/models/__init__.py +0 -0
  40. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/models/bar_fields.py +0 -0
  41. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/models/bar_period.py +0 -0
  42. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/models/order_types.py +0 -0
  43. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/models/rejection_reasons.py +0 -0
  44. {onesecondtrader-0.51.0 → onesecondtrader-0.53.0}/src/onesecondtrader/models/trade_sides.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: onesecondtrader
-Version: 0.51.0
+Version: 0.53.0
 Summary: The Trading Infrastructure Toolkit for Python. Research, simulate, and deploy algorithmic trading strategies — all in one place.
 License-File: LICENSE
 Author: Nils P. Kujath
@@ -1,6 +1,6 @@
 [project]
 name = "onesecondtrader"
-version = "0.51.0"
+version = "0.53.0"
 description = "The Trading Infrastructure Toolkit for Python. Research, simulate, and deploy algorithmic trading strategies — all in one place."
 authors = [
     {name = "Nils P. Kujath",email = "63961429+NilsKujath@users.noreply.github.com"}
@@ -21,6 +21,7 @@ dependencies = [
 
 [tool.poetry]
 packages = [{include = "onesecondtrader", from = "src"}]
+include = ["src/onesecondtrader/**/*.sql"]
 
 
 [tool.poetry.group.dev.dependencies]
@@ -0,0 +1,15 @@
+"""
+Provides the schema for creating the security master database and utilities for populating it.
+"""
+
+from .utils import (
+    create_secmaster_db,
+    ingest_databento_zip,
+    ingest_databento_dbn,
+)
+
+__all__ = [
+    "create_secmaster_db",
+    "ingest_databento_zip",
+    "ingest_databento_dbn",
+]
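
Taken together, these exports support a create-then-ingest workflow. A minimal sketch, assuming hypothetical file paths (only the function names and signatures come from the package):

    import pathlib

    from onesecondtrader.secmaster import create_secmaster_db, ingest_databento_zip

    # Create a fresh schema-v1 database file (raises FileExistsError if one exists).
    db = create_secmaster_db(pathlib.Path("data/secmaster.db"))

    # Ingest a Databento batch archive; returns the counts of OHLCV and
    # symbology records seen in the archive.
    ohlcv_seen, symbology_seen = ingest_databento_zip(
        zip_path=pathlib.Path("downloads/glbx-mdp3-batch.zip"),
        db_path=db,
    )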
@@ -0,0 +1,197 @@
+-- Security master database schema.
+--
+-- The schema is designed for Databento-native ingestion via DBN files, while remaining compatible with other sources.
+-- Instrument identity is modeled per publisher namespace and supports either numeric upstream identifiers or symbols.
+-- Contract specifications and other static reference metadata are intentionally out of scope for this schema and should be stored separately if ingested.
+--
+-- The schema is explicitly ingestion-safe in the sense that:
+--
+-- 1) publishers are keyed by (vendor, dataset) rather than vendor alone, allowing multiple feeds per vendor;
+-- 2) symbology admits multiple mappings sharing the same start date by including the resolved instrument identifier
+--    in the primary key, preventing accidental overwrites during bulk ingestion.
+--
+-- | Table         | Description |
+-- |---------------|-------------|
+-- | `publishers`  | Registry of vendor+dataset namespaces used for market data and instrument ingestion. |
+-- | `instruments` | Registry of instruments observed from ingestion within a publisher namespace. |
+-- | `ohlcv`       | Aggregated OHLCV bar data keyed by instrument, bar duration (`rtype`), and event timestamp (`ts_event`). |
+-- | `symbology`   | Time-bounded mappings from publisher-native symbols to publisher-native instrument identifiers. |
+
+
+
+-- Registry of all data sources used for market data and instrument ingestion.
+--
+-- Each row represents a distinct data product (feed) within a vendor namespace.
+-- A publisher record is uniquely identified by the pair (`name`, `dataset`), not by `name` alone.
+-- This allows a single vendor (e.g. Databento) to appear multiple times, once per concrete dataset/feed
+-- (e.g. `GLBX.MDP3`, `XNAS.ITCH`).
+--
+-- A publisher establishes the provenance of instrument definitions and price data and provides the context
+-- in which raw symbols and native instrument identifiers are interpreted.
+--
+-- | Field          | Type      | Constraints   | Description |
+-- |----------------|-----------|---------------|-------------|
+-- | `publisher_id` | `INTEGER` | `PRIMARY KEY` | Internal surrogate key uniquely identifying a publisher record within the system. |
+-- | `name`         | `TEXT`    | `NOT NULL`    | Human-readable vendor identifier for the data source (e.g. `databento`, `yfinance`). |
+-- | `dataset`      | `TEXT`    | `NOT NULL`    | Identifier of the concrete data product or feed through which data is sourced; uses Databento dataset names (e.g. `GLBX.MDP3`) for Databento ingestion and internal identifiers for other sources (e.g. `YFINANCE`). |
+-- | `venue`        | `TEXT`    |               | Optional ISO 10383 Market Identifier Code (MIC) describing the primary trading venue; may be `NULL` for aggregated or multi-venue sources. |
+--
+-- **Table constraints**
+--
+-- * `UNIQUE(name, dataset)` ensures that each vendor+feed combination is represented at most once.
+--
+-- **Examples**
+--
+-- Databento CME Globex feed:
+--
+-- * `name` = `'databento'`
+-- * `dataset` = `'GLBX.MDP3'`
+-- * `venue` = `'XCME'`
+--
+-- Databento NASDAQ TotalView feed:
+--
+-- * `name` = `'databento'`
+-- * `dataset` = `'XNAS.ITCH'`
+-- * `venue` = `'XNAS'`
+--
+-- Yahoo Finance equity data:
+--
+-- * `name` = `'yfinance'`
+-- * `dataset` = `'YFINANCE'`
+-- * `venue` = `NULL`
+--
+CREATE TABLE publishers (
+    publisher_id INTEGER PRIMARY KEY,
+    name         TEXT NOT NULL,
+    dataset      TEXT NOT NULL,
+    venue        TEXT,
+    UNIQUE (name, dataset)
+);
+
+
+
+
+
+-- Registry of instruments observed through market data ingestion.
+--
+-- Each row represents an instrument identity within a publisher namespace.
+-- Instruments may be identified by a publisher-native numeric identifier, a symbol identifier, or both.
+-- Databento ingestion uses `source_instrument_id` as the primary identifier and may optionally store a symbol from symbology.
+-- Symbol-first sources such as yfinance use `symbol` as the primary identifier and typically leave `source_instrument_id` `NULL`.
+--
+-- The table does not store contract specifications or other reference metadata.
+-- Such metadata must be stored separately when available.
+--
+-- | Field                  | Type      | Constraints      | Description |
+-- |------------------------|-----------|------------------|-------------|
+-- | `instrument_id`        | `INTEGER` | `PRIMARY KEY`    | Internal surrogate key identifying an instrument record within the system. |
+-- | `publisher_ref`        | `INTEGER` | `NOT NULL`, `FK` | Foreign key reference to `publishers.publisher_id`, defining the publisher namespace in which this instrument identity is valid. |
+-- | `source_instrument_id` | `INTEGER` |                  | Publisher-native numeric instrument identifier as provided by the upstream data source (e.g. Databento `instrument_id`); may be `NULL` for symbol-only sources. |
+-- | `symbol`               | `TEXT`    |                  | Publisher-native symbol string identifying the instrument (e.g. raw symbol, ticker); may be `NULL` when numeric identifiers are used. |
+-- | `symbol_type`          | `TEXT`    |                  | Identifier describing the symbol scheme or resolution type used by the publisher (e.g. `raw_symbol`, `continuous`, `ticker`). |
+--
+-- Each instrument must be identifiable by at least one of `source_instrument_id` or `symbol`.
+-- Uniqueness constraints ensure that instrument identities do not collide within a publisher namespace.
+-- The table intentionally excludes contract specifications and other reference metadata, which must be stored separately when available.
+--
+CREATE TABLE instruments (
+    instrument_id INTEGER PRIMARY KEY,
+
+    publisher_ref INTEGER NOT NULL,
+
+    source_instrument_id INTEGER,
+    symbol               TEXT,
+    symbol_type          TEXT,
+
+    FOREIGN KEY (publisher_ref) REFERENCES publishers(publisher_id),
+
+    CHECK (
+        source_instrument_id IS NOT NULL
+        OR symbol IS NOT NULL
+    ),
+
+    CHECK (symbol IS NULL OR symbol_type IS NOT NULL),
+
+    UNIQUE (publisher_ref, source_instrument_id),
+    UNIQUE (publisher_ref, symbol, symbol_type)
+);
+
+
+
+
+
+
+-- Stores aggregated OHLCV bars for instruments at multiple time resolutions.
+--
+-- | Field           | Type      | Constraints                                 | Description |
+-- |-----------------|-----------|---------------------------------------------|-------------|
+-- | `instrument_id` | `INTEGER` | `NOT NULL`, `FK`                            | Foreign key reference to `instruments.instrument_id`, identifying the instrument to which this bar belongs. |
+-- | `rtype`         | `INTEGER` | `NOT NULL`, `CHECK IN (32, 33, 34, 35, 36)` | Record type code encoding the bar duration using Databento OHLCV conventions (e.g. `32`=1s, `33`=1m, `34`=1h, `35`=1d). |
+-- | `ts_event`      | `INTEGER` | `NOT NULL`                                  | Event timestamp of the bar as provided by the upstream source, stored as nanoseconds since the UTC Unix epoch. |
+-- | `open`          | `INTEGER` | `NOT NULL`                                  | Opening price of the bar interval, stored as a fixed-point integer using the upstream price scaling convention. |
+-- | `high`          | `INTEGER` | `NOT NULL`                                  | Highest traded price during the bar interval, stored as a fixed-point integer. |
+-- | `low`           | `INTEGER` | `NOT NULL`, `CHECK(low <= high)`            | Lowest traded price during the bar interval, stored as a fixed-point integer. |
+-- | `close`         | `INTEGER` | `NOT NULL`                                  | Closing price of the bar interval, stored as a fixed-point integer. |
+-- | `volume`        | `INTEGER` | `NOT NULL`, `CHECK(volume >= 0)`            | Total traded volume during the bar interval. |
+--
+-- The composite primary key enforces uniqueness per instrument, bar duration, and event timestamp.
+-- Integrity constraints ensure basic OHLC consistency and prevent invalid price relationships from being stored.
+-- The table uses `WITHOUT ROWID` to store rows directly in the primary-key B-tree for reduced storage overhead and faster lookups.
+--
+CREATE TABLE ohlcv (
+    instrument_id INTEGER NOT NULL,
+    rtype         INTEGER NOT NULL CHECK(rtype IN (32, 33, 34, 35, 36)),
+    ts_event      INTEGER NOT NULL,
+    open          INTEGER NOT NULL,
+    high          INTEGER NOT NULL,
+    low           INTEGER NOT NULL,
+    close         INTEGER NOT NULL,
+    volume        INTEGER NOT NULL CHECK(volume >= 0),
+    FOREIGN KEY (instrument_id) REFERENCES instruments(instrument_id),
+    PRIMARY KEY (instrument_id, rtype, ts_event),
+    CHECK(low <= high),
+    CHECK(open BETWEEN low AND high),
+    CHECK(close BETWEEN low AND high)
+) WITHOUT ROWID;
+
+
+
+
+
+-- Stores time-bounded mappings from publisher-native symbols to publisher-native instrument identifiers.
+--
+-- The table captures symbol resolution rules as provided by upstream data sources and must be interpreted within the
+-- namespace of a specific publisher.
+--
+-- The schema permits multiple mappings to share the same `start_date` for a given (`publisher_ref`, `symbol`, `symbol_type`)
+-- by including `source_instrument_id` in the primary key. This prevents accidental overwrites when upstream symbology exports
+-- contain same-day corrections, backfills, or parallel resolution segments.
+--
+-- | Field                  | Type      | Constraints      | Description |
+-- |------------------------|-----------|------------------|-------------|
+-- | `publisher_ref`        | `INTEGER` | `NOT NULL`, `FK` | Foreign key reference to `publishers.publisher_id`, defining the publisher namespace in which the symbol mapping is valid. |
+-- | `symbol`               | `TEXT`    | `NOT NULL`       | Publisher-native symbol string as provided by the upstream source (e.g. raw symbol, continuous symbol). |
+-- | `symbol_type`          | `TEXT`    | `NOT NULL`       | Identifier describing the symbol scheme or resolution type used by the publisher (e.g. `raw_symbol`, `continuous`). |
+-- | `source_instrument_id` | `INTEGER` | `NOT NULL`       | Publisher-native numeric instrument identifier corresponding to the resolved symbol. |
+-- | `start_date`           | `TEXT`    | `NOT NULL`       | First calendar date (inclusive) on which this symbol-to-instrument mapping is valid, stored in YYYY-MM-DD format. |
+-- | `end_date`             | `TEXT`    | `NOT NULL`       | Last calendar date (inclusive) on which this symbol-to-instrument mapping is valid, stored in YYYY-MM-DD format. |
+--
+-- The primary key enforces uniqueness of mappings at the granularity of a resolved instrument.
+-- Date bounds are interpreted as closed intervals.
+--
+CREATE TABLE symbology (
+    publisher_ref        INTEGER NOT NULL,
+    symbol               TEXT NOT NULL,
+    symbol_type          TEXT NOT NULL,
+    source_instrument_id INTEGER NOT NULL,
+    start_date           TEXT NOT NULL,
+    end_date             TEXT NOT NULL,
+    FOREIGN KEY (publisher_ref) REFERENCES publishers(publisher_id),
+    FOREIGN KEY (publisher_ref, source_instrument_id)
+        REFERENCES instruments(publisher_ref, source_instrument_id),
+    PRIMARY KEY (publisher_ref, symbol, symbol_type, start_date, source_instrument_id),
+    CHECK (start_date <= end_date)
+);
+
+
+PRAGMA user_version = 1;
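
One way to see how the four tables compose: resolving a raw symbol to its bars means walking symbology to instruments to ohlcv within a single publisher namespace. A sketch using the stdlib sqlite3 module (the symbol, date, and database path below are hypothetical illustrations, not values shipped with the package):

    import sqlite3

    # Resolve a raw symbol to its 1-minute bars (rtype 33) on one trading date
    # by joining symbology -> instruments -> ohlcv.
    con = sqlite3.connect("secmaster.db")
    rows = con.execute(
        """
        SELECT o.ts_event, o.open, o.high, o.low, o.close, o.volume
        FROM symbology AS s
        JOIN instruments AS i
          ON i.publisher_ref = s.publisher_ref
         AND i.source_instrument_id = s.source_instrument_id
        JOIN ohlcv AS o
          ON o.instrument_id = i.instrument_id
        WHERE s.symbol = ?
          AND s.symbol_type = 'raw_symbol'
          AND ? BETWEEN s.start_date AND s.end_date
          AND o.rtype = 33
        ORDER BY o.ts_event
        """,
        ("ESH4", "2024-01-02"),
    ).fetchall()
    con.close()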
@@ -0,0 +1,578 @@
+from __future__ import annotations
+
+import json
+import logging
+import pathlib
+import shutil
+import sqlite3
+import tempfile
+import zipfile
+
+import databento
+
+
+BATCH_SIZE = 10000
+LOG_EVERY_OHLCV = 1_000_000
+LOG_EVERY_SYMBOLOGY = 50_000
+
+logger = logging.getLogger(__name__)
+
+
+def create_secmaster_db(db_path: pathlib.Path, schema_version: int = 1) -> pathlib.Path:
+    """
+    Create a new security master SQLite database using a selected schema version.
+
+    The database file is created at the given path and initialized by executing the SQL script
+    located in the `schema_versions` directory adjacent to this module.
+
+    The function expects the schema script to set `PRAGMA user_version` to the corresponding
+    schema version and verifies this after execution.
+
+    Parameters:
+        db_path:
+            Filesystem path at which the SQLite database file will be created.
+        schema_version:
+            Version number selecting the schema script to apply.
+
+    Returns:
+        The path to the created database file.
+
+    Raises:
+        FileExistsError:
+            If a file already exists at `db_path`.
+        FileNotFoundError:
+            If the schema script for `schema_version` does not exist.
+        sqlite3.DatabaseError:
+            If the applied schema does not set the expected `user_version` or if SQLite fails
+            while executing the schema.
+    """
+    if db_path.exists():
+        raise FileExistsError(f"Database already exists: {db_path}")
+
+    schema_path = (
+        pathlib.Path(__file__).resolve().parent
+        / "schema_versions"
+        / f"secmaster_schema_v{schema_version}.sql"
+    )
+
+    if not schema_path.is_file():
+        raise FileNotFoundError(
+            f"Schema version {schema_version} not found: {schema_path}"
+        )
+
+    db_path.parent.mkdir(parents=True, exist_ok=True)
+
+    schema_sql = schema_path.read_text(encoding="utf-8")
+
+    with sqlite3.connect(str(db_path)) as con:
+        con.execute("PRAGMA foreign_keys = ON;")
+        con.executescript(schema_sql)
+
+        row = con.execute("PRAGMA user_version;").fetchone()
+        actual_version = int(row[0]) if row else 0
+
+        if actual_version != schema_version:
+            raise sqlite3.DatabaseError(
+                f"Schema script set user_version={actual_version}, expected {schema_version}"
+            )
+
+    return db_path
+
+
+def ingest_databento_zip(
+    zip_path: pathlib.Path,
+    db_path: pathlib.Path,
+    publisher_name: str = "databento",
+    symbol_type: str = "raw_symbol",
+    dataset: str | None = None,
+) -> tuple[int, int]:
+    """
+    Ingest market data from a Databento zip archive into the security master database.
+
+    The archive may contain one or more DBN files and an optional `symbology.json`. The function
+    ingests OHLCV records from DBN files into `ohlcv` and ingests symbol-to-instrument mappings
+    into `symbology`.
+
+    The publisher namespace is created if absent. Publisher identity is determined by the pair
+    `(publisher_name, dataset)`, where `dataset` is extracted from `metadata.json` in the archive.
+
+    Ingestion is idempotent with respect to primary keys: existing `ohlcv` and `symbology` rows are
+    left unchanged.
+
+    Parameters:
+        zip_path:
+            Path to the Databento zip archive.
+        db_path:
+            Path to the security master SQLite database.
+        publisher_name:
+            Vendor name stored in `publishers.name`. The dataset is derived from archive metadata.
+        symbol_type:
+            Symbol scheme stored in `symbology.symbol_type` for symbols found in `symbology.json`.
+        dataset:
+            Optional dataset override. If provided, it is used when `metadata.json` is missing or
+            does not specify a dataset.
+
+    Returns:
+        A tuple of (ohlcv_record_count_seen, symbology_record_count_seen).
+    """
+    ohlcv_count = 0
+    symbology_count = 0
+
+    logger.info("Opening Databento archive: %s", zip_path)
+
+    if not db_path.is_file():
+        raise FileNotFoundError(f"Security master DB not found: {db_path}")
+
+    con = sqlite3.connect(str(db_path))
+
+    try:
+        con.execute("PRAGMA foreign_keys = ON;")
+        _assert_secmaster_db(con)
+        _enable_bulk_loading(con)
+
+        with con:
+            with zipfile.ZipFile(zip_path, "r") as zf:
+                dataset, venue = _extract_dataset_info(zf, dataset_override=dataset)
+                logger.info(
+                    "Publisher resolved: name=%s dataset=%s venue=%s",
+                    publisher_name,
+                    dataset,
+                    venue,
+                )
+                publisher_id = _get_or_create_publisher(
+                    con, publisher_name, dataset, venue
+                )
+
+                with tempfile.TemporaryDirectory() as tmpdir:
+                    dbn_files = [
+                        n
+                        for n in zf.namelist()
+                        if n.endswith(".dbn.zst") or n.endswith(".dbn")
+                    ]
+                    symbology_member = _zip_find_member(zf, "symbology.json")
+
+                    if not dbn_files and symbology_member is None:
+                        raise ValueError(
+                            "Archive contains no DBN files and no symbology.json"
+                        )
+
+                    logger.info("Found %d DBN file(s) in archive", len(dbn_files))
+
+                    for name in dbn_files:
+                        extracted_path = _zip_member_to_tempfile(zf, name, tmpdir)
+                        try:
+                            logger.info("Ingesting DBN file: %s", extracted_path.name)
+                            ohlcv_count += _ingest_dbn(
+                                extracted_path, con, publisher_id
+                            )
+                        finally:
+                            try:
+                                extracted_path.unlink()
+                            except FileNotFoundError:
+                                pass
+
+                    if symbology_member is not None:
+                        symbology_path = _zip_member_to_tempfile(
+                            zf, symbology_member, tmpdir
+                        )
+                        try:
+                            logger.info("Ingesting symbology.json")
+                            symbology_count += _ingest_symbology(
+                                symbology_path,
+                                con,
+                                publisher_id,
+                                symbol_type=symbol_type,
+                            )
+                        finally:
+                            try:
+                                symbology_path.unlink()
+                            except FileNotFoundError:
+                                pass
+                    else:
+                        logger.info("No symbology.json present in archive")
+    finally:
+        try:
+            _disable_bulk_loading(con)
+        finally:
+            con.close()
+
+    logger.info(
+        "Finished zip ingestion: %s (%d OHLCV records, %d symbology records)",
+        zip_path.name,
+        ohlcv_count,
+        symbology_count,
+    )
+
+    return ohlcv_count, symbology_count
+
+
+def ingest_databento_dbn(
+    dbn_path: pathlib.Path,
+    db_path: pathlib.Path,
+    publisher_name: str = "databento",
+) -> int:
+    """
+    Ingest market data from a Databento DBN file into the security master database.
+
+    Reads OHLCV records from the DBN file and inserts them into `ohlcv`. The publisher namespace
+    is created if absent. Publisher identity is determined by the pair `(publisher_name, dataset)`,
+    where `dataset` is read from DBN metadata.
+
+    Ingestion is idempotent with respect to primary keys: existing bars are left unchanged.
+
+    Parameters:
+        dbn_path:
+            Path to the DBN file (.dbn or .dbn.zst).
+        db_path:
+            Path to the security master SQLite database.
+        publisher_name:
+            Vendor name stored in `publishers.name`. The dataset is derived from DBN metadata.
+
+    Returns:
+        The number of OHLCV records seen in the DBN stream.
+    """
+    logger.info("Starting DBN ingestion: %s", dbn_path)
+
+    if not db_path.is_file():
+        raise FileNotFoundError(f"Security master DB not found: {db_path}")
+
+    con = sqlite3.connect(str(db_path))
+
+    try:
+        con.execute("PRAGMA foreign_keys = ON;")
+        _assert_secmaster_db(con)
+        _enable_bulk_loading(con)
+
+        with con:
+            store = databento.DBNStore.from_file(dbn_path)
+            dataset = store.metadata.dataset
+            if not dataset:
+                raise ValueError(f"DBN metadata missing dataset: {dbn_path}")
+            venue = dataset.split(".")[0] if "." in dataset else None
+
+            logger.info(
+                "Publisher resolved: name=%s dataset=%s venue=%s",
+                publisher_name,
+                dataset,
+                venue,
+            )
+
+            publisher_id = _get_or_create_publisher(con, publisher_name, dataset, venue)
+            count = _ingest_dbn(dbn_path, con, publisher_id)
+    finally:
+        try:
+            _disable_bulk_loading(con)
+        finally:
+            con.close()
+
+    logger.info("Finished DBN ingestion: %s (%d OHLCV records)", dbn_path.name, count)
+
+    return count
+
+
+def _extract_dataset_info(
+    zf: zipfile.ZipFile,
+    dataset_override: str | None = None,
+) -> tuple[str, str | None]:
+    metadata_member = _zip_find_member(zf, "metadata.json")
+    if metadata_member is None:
+        if dataset_override is None:
+            raise ValueError(
+                "Archive is missing metadata.json and no dataset override was provided"
+            )
+        dataset = dataset_override
+    else:
+        with zf.open(metadata_member) as f:
+            metadata = json.load(f)
+        dataset = metadata.get("query", {}).get("dataset")
+        if not dataset:
+            if dataset_override is None:
+                raise ValueError(
+                    f"metadata.json is missing query.dataset (member={metadata_member!r})"
+                )
+            dataset = dataset_override
+
+    venue = dataset.split(".")[0] if "." in dataset else None
+    return dataset, venue
+
+
+def _zip_find_member(
+    zf: zipfile.ZipFile,
+    basename: str,
+    allow_multiple: bool = False,
+) -> str | None:
+    candidates = [
+        name
+        for name in zf.namelist()
+        if name == basename or name.endswith("/" + basename)
+    ]
+    if not candidates:
+        return None
+    if len(candidates) == 1:
+        return candidates[0]
+
+    candidates = sorted(candidates)
+    if not allow_multiple:
+        raise ValueError(f"Multiple {basename} members found in archive: {candidates}")
+
+    selected = candidates[0]
+    logger.warning("Multiple %s found in archive; using %s", basename, selected)
+    return selected
+
+
+def _zip_member_to_tempfile(
+    zf: zipfile.ZipFile,
+    member_name: str,
+    tmpdir: str,
+) -> pathlib.Path:
+    suffix = "".join(pathlib.PurePosixPath(member_name).suffixes)
+    with tempfile.NamedTemporaryFile(
+        mode="wb",
+        suffix=suffix,
+        delete=False,
+        dir=tmpdir,
+    ) as tmp:
+        with zf.open(member_name) as src:
+            shutil.copyfileobj(src, tmp)
+    return pathlib.Path(tmp.name)
+
+
+def _get_or_create_publisher(
+    con: sqlite3.Connection,
+    name: str,
+    dataset: str,
+    venue: str | None,
+) -> int:
+    cursor = con.cursor()
+    cursor.execute(
+        "SELECT publisher_id FROM publishers WHERE name = ? AND dataset = ?",
+        (name, dataset),
+    )
+    row = cursor.fetchone()
+    if row:
+        return row[0]
+
+    cursor.execute(
+        "INSERT INTO publishers (name, dataset, venue) VALUES (?, ?, ?)",
+        (name, dataset, venue),
+    )
+    return cursor.lastrowid  # type: ignore[return-value]
+
+
+def _get_or_create_instrument(
+    con: sqlite3.Connection,
+    publisher_id: int,
+    source_instrument_id: int,
+) -> int:
+    cursor = con.cursor()
+    cursor.execute(
+        "SELECT instrument_id FROM instruments WHERE publisher_ref = ? AND source_instrument_id = ?",
+        (publisher_id, source_instrument_id),
+    )
+    row = cursor.fetchone()
+    if row:
+        return row[0]
+
+    cursor.execute(
+        "INSERT INTO instruments (publisher_ref, source_instrument_id) VALUES (?, ?)",
+        (publisher_id, source_instrument_id),
+    )
+    return cursor.lastrowid  # type: ignore[return-value]
+
+
+def _assert_secmaster_db(
+    con: sqlite3.Connection, expected_user_version: int = 1
+) -> None:
+    row = con.execute("PRAGMA user_version;").fetchone()
+    user_version = int(row[0]) if row else 0
+    if user_version != expected_user_version:
+        raise sqlite3.DatabaseError(
+            "Security master schema user_version="
+            f"{user_version} does not match expected {expected_user_version}"
+        )
+
+    required = {"publishers", "instruments", "ohlcv", "symbology"}
+    present = {
+        r[0]
+        for r in con.execute(
+            "SELECT name FROM sqlite_master WHERE type = 'table'"
+        ).fetchall()
+    }
+    missing = sorted(required - present)
+    if missing:
+        raise sqlite3.DatabaseError(
+            f"Security master schema missing required tables: {', '.join(missing)}"
+        )
+
+
+def _ingest_dbn(
+    dbn_path: pathlib.Path,
+    con: sqlite3.Connection,
+    publisher_id: int,
+) -> int:
+    store = databento.DBNStore.from_file(dbn_path)
+    cursor = con.cursor()
+
+    instrument_cache: dict[int, int] = {}
+    batch: list[tuple] = []
+    count = 0
+
+    logger.info("Streaming OHLCV records from: %s", dbn_path.name)
+
+    for record in store:
+        if not isinstance(record, databento.OHLCVMsg):
+            continue
+
+        source_id = record.instrument_id
+        if source_id not in instrument_cache:
+            instrument_cache[source_id] = _get_or_create_instrument(
+                con, publisher_id, source_id
+            )
+        internal_id = instrument_cache[source_id]
+
+        rtype_val = (
+            record.rtype.value if hasattr(record.rtype, "value") else record.rtype
+        )
+
+        batch.append(
+            (
+                internal_id,
+                rtype_val,
+                record.ts_event,
+                record.open,
+                record.high,
+                record.low,
+                record.close,
+                record.volume,
+            )
+        )
+        count += 1
+
+        if count % LOG_EVERY_OHLCV == 0:
+            logger.info("Ingested %d OHLCV records from %s", count, dbn_path.name)
+
+        if len(batch) >= BATCH_SIZE:
+            cursor.executemany(
+                "INSERT OR IGNORE INTO ohlcv "
+                "(instrument_id, rtype, ts_event, open, high, low, close, volume) "
+                "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
+                batch,
+            )
+            batch.clear()
+
+    if batch:
+        cursor.executemany(
+            "INSERT OR IGNORE INTO ohlcv "
+            "(instrument_id, rtype, ts_event, open, high, low, close, volume) "
+            "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
+            batch,
+        )
+
+    logger.info("Completed OHLCV ingest from %s (%d records)", dbn_path.name, count)
+
+    return count
+
+
+def _ingest_symbology(
+    json_path: pathlib.Path,
+    con: sqlite3.Connection,
+    publisher_id: int,
+    symbol_type: str = "raw_symbol",
+) -> int:
+    if not isinstance(symbol_type, str) or not symbol_type:
+        raise ValueError("symbol_type must be a non-empty string")
+
+    with open(json_path, "r") as f:
+        data = json.load(f)
+
+    if not isinstance(data, dict):
+        raise ValueError("symbology.json root must be a JSON object")
+
+    result = data.get("result", {})
+    if not isinstance(result, dict):
+        raise ValueError("symbology.json['result'] must be an object")
+    cursor = con.cursor()
+
+    batch: list[tuple] = []
+    count = 0
+
+    logger.info("Streaming symbology mappings from: %s", json_path.name)
+
+    instrument_cache: set[int] = set()
+
+    for symbol, mappings in result.items():
+        if not isinstance(mappings, list):
+            raise ValueError(
+                f"symbology.json mappings must be a list for symbol={symbol!r}"
+            )
+
+        for i, mapping in enumerate(mappings):
+            if not isinstance(mapping, dict):
+                raise ValueError(
+                    f"symbology.json mapping must be an object at symbol={symbol!r} index={i}"
+                )
+
+            missing_keys = [k for k in ("s", "d0", "d1") if k not in mapping]
+            if missing_keys:
+                raise ValueError(
+                    "symbology.json mapping missing key(s) "
+                    f"{missing_keys} at symbol={symbol!r} index={i}"
+                )
+
+            source_id = int(mapping["s"])
+
+            if source_id not in instrument_cache:
+                _get_or_create_instrument(con, publisher_id, source_id)
+                instrument_cache.add(source_id)
+
+            batch.append(
+                (
+                    publisher_id,
+                    symbol,
+                    symbol_type,
+                    source_id,
+                    mapping["d0"],
+                    mapping["d1"],
+                )
+            )
+            count += 1
+
+            if count % LOG_EVERY_SYMBOLOGY == 0:
+                logger.info(
+                    "Ingested %d symbology mappings from %s", count, json_path.name
+                )
+
+            if len(batch) >= BATCH_SIZE:
+                cursor.executemany(
+                    "INSERT OR IGNORE INTO symbology "
+                    "(publisher_ref, symbol, symbol_type, source_instrument_id, start_date, end_date) "
+                    "VALUES (?, ?, ?, ?, ?, ?)",
+                    batch,
+                )
+                batch.clear()
+
+    if batch:
+        cursor.executemany(
+            "INSERT OR IGNORE INTO symbology "
+            "(publisher_ref, symbol, symbol_type, source_instrument_id, start_date, end_date) "
+            "VALUES (?, ?, ?, ?, ?, ?)",
+            batch,
+        )
+
+    logger.info(
+        "Completed symbology ingest from %s (%d mappings)", json_path.name, count
+    )
+
+    return count
+
+
+def _enable_bulk_loading(con: sqlite3.Connection) -> None:
+    con.execute("PRAGMA journal_mode = WAL")
+    con.execute("PRAGMA synchronous = NORMAL")
+    con.execute("PRAGMA cache_size = -64000")
+
+
+def _disable_bulk_loading(con: sqlite3.Connection) -> None:
+    con.execute("PRAGMA synchronous = FULL")
+    con.execute("PRAGMA journal_mode = DELETE")
+    con.execute("PRAGMA cache_size = -2000")
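
All progress reporting in these utilities goes through the module-level logging.getLogger(__name__) logger, so nothing is printed unless the host application configures logging. A minimal sketch using ordinary stdlib logging configuration (the file paths are hypothetical; the function name and return value come from the package):

    import logging
    import pathlib

    from onesecondtrader.secmaster import ingest_databento_dbn

    # Surface the module's INFO-level progress messages (publisher resolution,
    # periodic record counts, completion summaries).
    logging.basicConfig(level=logging.INFO)

    count = ingest_databento_dbn(
        dbn_path=pathlib.Path("downloads/glbx-mdp3-20240102.ohlcv-1s.dbn.zst"),
        db_path=pathlib.Path("data/secmaster.db"),
    )
    print(f"{count} OHLCV records seen")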