any2heliosdb 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- any2heliosdb/__init__.py +16 -0
- any2heliosdb/__main__.py +7 -0
- any2heliosdb/assess/__init__.py +28 -0
- any2heliosdb/assess/inventory.py +78 -0
- any2heliosdb/assess/render.py +161 -0
- any2heliosdb/assess/report.py +125 -0
- any2heliosdb/cdc/__init__.py +0 -0
- any2heliosdb/cdc/engine.py +143 -0
- any2heliosdb/cdc/registry.py +76 -0
- any2heliosdb/cdc/replicat.py +109 -0
- any2heliosdb/cdc/sinks/__init__.py +0 -0
- any2heliosdb/cdc/sources/__init__.py +0 -0
- any2heliosdb/cdc/sources/mysql_binlog.py +193 -0
- any2heliosdb/cdc/sources/oracle_scn.py +43 -0
- any2heliosdb/cdc/trail.py +51 -0
- any2heliosdb/chunking/__init__.py +0 -0
- any2heliosdb/chunking/pk_range.py +65 -0
- any2heliosdb/cli.py +489 -0
- any2heliosdb/config/__init__.py +0 -0
- any2heliosdb/config/model.py +85 -0
- any2heliosdb/config/store.py +146 -0
- any2heliosdb/config/wizard.py +119 -0
- any2heliosdb/constants.py +148 -0
- any2heliosdb/core/__init__.py +0 -0
- any2heliosdb/core/catalog_model.py +367 -0
- any2heliosdb/core/change_record.py +86 -0
- any2heliosdb/core/identifiers.py +80 -0
- any2heliosdb/core/loader.py +172 -0
- any2heliosdb/core/manifest.py +304 -0
- any2heliosdb/core/orchestrator.py +333 -0
- any2heliosdb/emit/__init__.py +0 -0
- any2heliosdb/emit/ddl.py +137 -0
- any2heliosdb/emit/mysql_ddl.py +195 -0
- any2heliosdb/emit/oracle_ddl.py +80 -0
- any2heliosdb/errors.py +51 -0
- any2heliosdb/geom/__init__.py +0 -0
- any2heliosdb/mcp/__init__.py +45 -0
- any2heliosdb/mcp/auth.py +179 -0
- any2heliosdb/mcp/protocol.py +141 -0
- any2heliosdb/mcp/server.py +222 -0
- any2heliosdb/mcp/tools.py +554 -0
- any2heliosdb/monitor/__init__.py +12 -0
- any2heliosdb/monitor/live.py +240 -0
- any2heliosdb/plsql/__init__.py +19 -0
- any2heliosdb/plsql/cost.py +47 -0
- any2heliosdb/plsql/gap.py +125 -0
- any2heliosdb/plsql/rewrite.py +351 -0
- any2heliosdb/sources/__init__.py +0 -0
- any2heliosdb/sources/base.py +81 -0
- any2heliosdb/sources/mssql/__init__.py +0 -0
- any2heliosdb/sources/mssql/adapter.py +429 -0
- any2heliosdb/sources/mysql/__init__.py +0 -0
- any2heliosdb/sources/mysql/adapter.py +237 -0
- any2heliosdb/sources/oracle/__init__.py +0 -0
- any2heliosdb/sources/oracle/adapter.py +309 -0
- any2heliosdb/sources/postgres/__init__.py +0 -0
- any2heliosdb/sources/postgres/adapter.py +608 -0
- any2heliosdb/target/__init__.py +0 -0
- any2heliosdb/target/base.py +196 -0
- any2heliosdb/target/capability.py +178 -0
- any2heliosdb/target/copy_codec.py +88 -0
- any2heliosdb/target/mysql_driver.py +239 -0
- any2heliosdb/target/native_driver.py +205 -0
- any2heliosdb/target/psycopg_driver.py +288 -0
- any2heliosdb/typemap/__init__.py +0 -0
- any2heliosdb/typemap/defaults.py +251 -0
- any2heliosdb/typemap/registry.py +83 -0
- any2heliosdb/validate/__init__.py +17 -0
- any2heliosdb/validate/counts.py +56 -0
- any2heliosdb/validate/data.py +244 -0
- any2heliosdb/validate/model.py +60 -0
- any2heliosdb/validate/structure.py +52 -0
- any2heliosdb-0.9.1.dist-info/METADATA +319 -0
- any2heliosdb-0.9.1.dist-info/RECORD +77 -0
- any2heliosdb-0.9.1.dist-info/WHEEL +4 -0
- any2heliosdb-0.9.1.dist-info/entry_points.txt +3 -0
- any2heliosdb-0.9.1.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Replicat: apply captured change records to the target, idempotently.
|
|
2
|
+
|
|
3
|
+
Records are bucketed by table and applied through the target driver's upsert
|
|
4
|
+
(insert/update) and delete-by-key seams, so re-applying the same trail slice is
|
|
5
|
+
a no-op on row state. Tables without a primary key can't be keyed and are
|
|
6
|
+
skipped with a warning.
|
|
7
|
+
|
|
8
|
+
Within a table the trail is an *ordered* stream of explicit I/U/D change
|
|
9
|
+
records, so order is load-bearing: ``DELETE id=7`` then ``INSERT id=7`` must
|
|
10
|
+
leave the row present. The apply therefore walks each table's records in
|
|
11
|
+
arrival order, batching only maximal contiguous runs of the same op class
|
|
12
|
+
(upsert vs delete) so the relative delete/upsert order is preserved while still
|
|
13
|
+
amortizing the round-trips. (This is distinct from :meth:`reconcile_deletes`,
|
|
14
|
+
the snapshot-source key-set diff that has no per-record order to honor.)
|
|
15
|
+
"""
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from typing import Dict, Iterator, List, Tuple
|
|
19
|
+
|
|
20
|
+
from ..core.change_record import DELETE, ChangeRecord
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _runs_by_op_class(recs: List[ChangeRecord]) -> Iterator[Tuple[bool, List[ChangeRecord]]]:
    """Group an ordered record list into alternating delete / upsert batches.

    Walks ``recs`` once, front to back, and yields ``(is_delete, batch)``
    pairs. ``is_delete`` is True when every record in ``batch`` has
    ``op == DELETE`` and False when none does (the INSERT/UPDATE upserts).

    A new batch is started only where the op class flips, so concatenating
    the yielded batches reproduces ``recs`` exactly: a ``D`` followed by an
    ``I`` is never reordered into ``I`` followed by ``D``, while same-class
    neighbours still travel together. An empty input yields nothing.
    """
    batch: List[ChangeRecord] = []
    batch_is_delete = False
    for rec in recs:
        rec_is_delete = rec.op == DELETE
        if not batch:
            # First record of a fresh batch decides the batch's class.
            batch_is_delete = rec_is_delete
        elif rec_is_delete != batch_is_delete:
            # Class flipped: hand over the finished batch, start a new one.
            yield batch_is_delete, batch
            batch = []
            batch_is_delete = rec_is_delete
        batch.append(rec)
    if batch:
        yield batch_is_delete, batch
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class Replicat:
    """Apply captured change records to a target driver, keyed by primary key.

    The constructor indexes ``schema_ir.tables`` by upper-cased table name so
    change records are matched to their table definition case-insensitively.
    ``preserve_case`` controls how column identifiers are handed to the target:
    unchanged when True, lower-cased otherwise.

    The target object must provide ``upsert(table, key_cols, cols, rows)``,
    ``delete_keys(table, key_cols, keys)`` and ``select_keys(table, key_cols)``;
    whatever ``upsert``/``delete_keys`` return is summed into the reported
    counts.
    """

    def __init__(self, target, schema_ir, preserve_case: bool = False) -> None:  # type: ignore[no-untyped-def]
        self.target = target
        self.preserve_case = preserve_case
        self._by_name = {t.name.upper(): t for t in schema_ir.tables}

    def _ident(self, name: str) -> str:
        """Return ``name`` as the target should see it (lower-cased unless preserving case)."""
        return name if self.preserve_case else name.lower()

    def apply(self, records: List[ChangeRecord]) -> Tuple[int, List[str]]:
        """Apply ``records`` table by table and return ``(applied, warnings)``.

        Records are bucketed by ``record.table`` (first-seen order). A bucket is
        skipped, with one warning, when its table is not part of the schema
        this replicat was built from, or when the table has no primary key
        columns (nothing to key an upsert or delete on). The two cases get
        distinct warnings so an unknown table is not misreported as a missing
        primary key.

        ``applied`` is the sum of the values returned by the target's
        ``upsert`` and ``delete_keys`` calls.
        """
        buckets: Dict[str, List[ChangeRecord]] = {}
        for r in records:
            buckets.setdefault(r.table, []).append(r)

        applied = 0
        warnings: List[str] = []
        for tname, recs in buckets.items():
            t = self._by_name.get(tname.upper())
            if t is None:
                warnings.append("{}: table not in schema; skipped {} change(s)".format(tname, len(recs)))
                continue
            if not (t.primary_key and t.primary_key.columns):
                warnings.append("{}: no primary key; skipped {} change(s)".format(tname, len(recs)))
                continue
            target_table = t.target_name(self.preserve_case)
            cols = [c.name for c in t.columns]
            tcols = [self._ident(c) for c in cols]
            # Source-side PK names index the record's key map; the rendered
            # identifiers are what the target driver receives.
            pk = t.primary_key.columns
            key_cols = [self._ident(c) for c in pk]

            # Apply in arrival order, flushing one op class at a time so that an
            # earlier DELETE is never resurrected by a later batched upsert (and
            # vice versa). Contiguous same-class records are coalesced into a
            # single driver call to keep the round-trip count low.
            for is_delete, run in _runs_by_op_class(recs):
                if is_delete:
                    keys = [[r.key.get(c) for c in pk] for r in run]
                    applied += self.target.delete_keys(target_table, key_cols, keys)
                else:
                    rows = [[r.after.get(c) for c in cols] for r in run]
                    applied += self.target.upsert(target_table, key_cols, tcols, rows)
        return applied, warnings

    def reconcile_deletes(self, source_adapter) -> Tuple[int, List[str]]:  # type: ignore[no-untyped-def]
        """Delete target rows whose PK is absent from the source's current keys.

        v1 SCN-watermark capture cannot observe DELETEs (the rows are already
        gone), so the replicat reconciles them with a full key-set diff: keys on
        the target but not in the source are removed. This is a full pass
        (cost O(keys)); incremental delete capture is the log-based roadmap.

        Tables without primary key columns are skipped silently. Any exception
        raised while reconciling one table is recorded as a warning and the
        remaining tables are still processed.

        Returns ``(deleted, warnings)`` where ``deleted`` sums the values
        returned by the target's ``delete_keys``.
        """
        deleted = 0
        warnings: List[str] = []
        for t in self._by_name.values():
            if not (t.primary_key and t.primary_key.columns):
                continue
            pk = t.primary_key.columns
            try:
                # Both key sets are held in memory as tuples so they can be diffed.
                src_keys = {tuple(r) for r in source_adapter.stream_rows(t, pk)}
                target_table = t.target_name(self.preserve_case)
                key_idents = [self._ident(c) for c in pk]
                tgt_keys = {tuple(r) for r in self.target.select_keys(target_table, key_idents)}
                extra = [list(k) for k in (tgt_keys - src_keys)]
                if extra:
                    deleted += self.target.delete_keys(target_table, key_idents, extra)
            except Exception as e:  # noqa: BLE001
                warnings.append("delete reconcile {}: {}".format(t.name, e))
        return deleted, warnings
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""MySQL binlog CDC capture (log-based).
|
|
2
|
+
|
|
3
|
+
Reads ROW-format binlog events (`mysql-replication`) and turns them into
|
|
4
|
+
`ChangeRecord`s — real inserts/updates **and deletes**, unlike the SCN-watermark
|
|
5
|
+
source. The capture cursor is the binlog coordinate ``"<file>:<pos>"``; on the
|
|
6
|
+
first cycle it anchors at the server's *current* position (so only changes after
|
|
7
|
+
the baseline load are captured) and returns no records.
|
|
8
|
+
|
|
9
|
+
Requires the source MySQL to have ``log_bin=ON`` and ``binlog_format=ROW``, and
|
|
10
|
+
the connecting user to hold ``REPLICATION SLAVE``/``REPLICATION CLIENT``.
|
|
11
|
+
|
|
12
|
+
Correctness depends on **full** row images and column metadata: with
|
|
13
|
+
``binlog_row_image=MINIMAL`` an UPDATE only logs the changed columns (the
|
|
14
|
+
replicat would then NULL the omitted ones), and with
|
|
15
|
+
``binlog_row_metadata=MINIMAL`` events carry ``UNKNOWN_COL0..`` placeholders
|
|
16
|
+
instead of real column names. The source therefore fails closed — it verifies
|
|
17
|
+
``binlog_format=ROW``, ``binlog_row_metadata=FULL`` and ``binlog_row_image=FULL``
|
|
18
|
+
before anchoring/capture, and rejects any captured event whose after-image does
|
|
19
|
+
not cover the table's columns.
|
|
20
|
+
"""
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from typing import Dict, List, Tuple
|
|
24
|
+
|
|
25
|
+
from ...core.change_record import DELETE, INSERT, UPDATE, ChangeRecord
|
|
26
|
+
from ...errors import Any2HeliosError
|
|
27
|
+
|
|
28
|
+
# Server variables that log-based CDC depends on. Each entry is
# (variable name, required value, reason quoted in the error message).
_REQUIRED_VARS = (
    (
        "binlog_format",
        "ROW",
        "binlog row events are required; STATEMENT/MIXED don't carry per-row before/after images",
    ),
    (
        "binlog_row_metadata",
        "FULL",
        "MINIMAL omits column names, so events expose UNKNOWN_COL0.. instead of real columns",
    ),
    (
        "binlog_row_image",
        "FULL",
        "MINIMAL logs only changed columns on UPDATE, so unchanged columns would be written as NULL",
    ),
)


def _require_full_row_image(settings: Dict[str, str]) -> None:
    """Validate the server's binlog settings against ``_REQUIRED_VARS``.

    ``settings`` is a mapping of server-variable name to value, as collected
    from ``SHOW VARIABLES``. Names are compared case-insensitively; values are
    compared after stripping whitespace and upper-casing, and a ``None`` value
    is treated as the empty string.

    Returns ``None`` when every required variable has its required value.
    Raises ``Any2HeliosError`` for the first required variable that is either
    absent from ``settings`` or set to something else; the message names the
    variable and explains why it matters. No I/O is performed, so plain dicts
    are enough to exercise it.
    """
    observed: Dict[str, str] = {}
    for name, value in settings.items():
        observed[str(name).lower()] = "" if value is None else str(value)

    for var, want, why in _REQUIRED_VARS:
        reported = observed.get(var.lower())

        if reported is None:
            # The variable could not be read at all.
            message = (
                "MySQL CDC: could not read '{}' (need {}={}). {}. Grant the connecting "
                "user access to read server variables, or set it server-side."
            ).format(var, var, want, why)
            raise Any2HeliosError(message)

        if reported.strip().upper() == want:
            continue

        # The variable is readable but has the wrong value.
        message = (
            "MySQL CDC requires {}={} but the server reports {}={!r}. {}. Fix it with "
            "`SET GLOBAL {}={}` (needs SYSTEM_VARIABLES_ADMIN; for binlog_format also "
            "restart replication threads) or set it in my.cnf and restart."
        ).format(var, want, var, reported, why, var, want)
        raise Any2HeliosError(message)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _check_image_columns(table: str, op: str, image_keys, expected_cols) -> None:  # type: ignore[no-untyped-def]
    """Reject a captured row image that cannot be applied losslessly.

    ``image_keys`` is an iterable of the column names present in the captured
    value map; ``expected_cols`` lists the column names that must be present.
    ``table`` and ``op`` are only used to label the error message.

    Two conditions raise ``Any2HeliosError``, checked in this order:

    * any key whose name starts with ``UNKNOWN_COL`` (compared
      case-insensitively), which means the event carried placeholder names;
    * any name in ``expected_cols`` that is missing from ``image_keys``.

    Extra keys beyond ``expected_cols`` are accepted. Returns ``None`` when
    the image passes both checks.
    """
    present = set(image_keys)

    placeholders = [key for key in present if str(key).upper().startswith("UNKNOWN_COL")]
    if placeholders:
        placeholders.sort()
        raise Any2HeliosError(
            "MySQL CDC: {} on {} carried unnamed columns {} — binlog_row_metadata is not "
            "FULL. Set binlog_row_metadata=FULL on the source and re-anchor.".format(
                op, table, placeholders))

    absent = [col for col in expected_cols if col not in present]
    if not absent:
        return
    raise Any2HeliosError(
        "MySQL CDC: {} on {} omitted columns {} from its row image — binlog_row_image "
        "is not FULL (partial images would be written as NULL). Set "
        "binlog_row_image=FULL on the source and re-anchor.".format(op, table, absent))
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class MySqlBinlogSource:
    """Turn MySQL ROW-format binlog events into ``ChangeRecord`` objects.

    ``dsn`` must expose ``host``, ``port``, ``user`` and ``password``;
    ``schema`` is the source schema name used to filter the stream and stamped
    on every record; ``tables`` supplies the table names, their column names
    and their primary-key columns; ``server_id`` is passed to the binlog
    stream reader.

    The capture cursor is a string ``"<file>:<pos>"``.
    """

    def __init__(self, dsn, schema, tables, server_id: int = 4279):  # type: ignore[no-untyped-def]
        self.dsn = dsn
        self.schema = schema
        self.server_id = server_id
        # Per-table lookups keyed by table name; a table without a primary key
        # maps to an empty key-column list.
        self._pk = {t.name: (list(t.primary_key.columns) if t.primary_key else []) for t in tables}
        self._cols = {t.name: [c.name for c in t.columns] for t in tables}
        self._tables = [t.name for t in tables]

    def _conn_settings(self) -> dict:
        """Return connection settings in the ``passwd``-keyed form given to the stream reader."""
        return {"host": self.dsn.host, "port": int(self.dsn.port),
                "user": self.dsn.user, "passwd": self.dsn.password or ""}

    @staticmethod
    def _read_row_image_vars(cur) -> Dict[str, str]:  # type: ignore[no-untyped-def]
        """Read the binlog_* variables that gate lossless capture into a dict.

        A variable whose query fails or returns no row is simply left out of
        the result, so the caller's validation reports it as unreadable.
        """
        out: Dict[str, str] = {}
        for var, _want, _why in _REQUIRED_VARS:
            try:
                cur.execute("SHOW VARIABLES LIKE %s", (var,))
                row = cur.fetchone()
            except Exception:  # noqa: BLE001
                row = None
            if row:
                # SHOW VARIABLES returns (Variable_name, Value).
                out[str(row[0])] = "" if row[1] is None else str(row[1])
        return out

    @staticmethod
    def _stale_key(before, after, pk) -> Dict[str, object]:  # type: ignore[no-untyped-def]
        """Return the before-image key when an UPDATE changed the primary key.

        ``before`` and ``after`` are the row's column->value maps and ``pk`` the
        key column names. The result is the old key (column -> old value) when
        every key column is present in ``before`` and at least one differs from
        ``after``; otherwise it is an empty dict (no key columns, incomplete
        before-image, or unchanged key).
        """
        if not pk or any(col not in before for col in pk):
            return {}
        old_key = {col: before[col] for col in pk}
        if all(old_key[col] == after.get(col) for col in pk):
            return {}
        return old_key

    def current_position(self) -> str:
        """Return the server's current binlog coordinate as ``"<file>:<pos>"``.

        Verifies the required binlog settings first and raises
        ``Any2HeliosError`` if they do not hold. Returns ``""`` when neither
        status statement yields a row.
        """
        import pymysql

        c = pymysql.connect(**{k: v for k, v in self._conn_settings().items() if k != "passwd"},
                            password=self.dsn.password or "")
        try:
            cur = c.cursor()
            # Binlog row events only carry column *names* when row metadata is FULL
            # (the default MINIMAL yields UNKNOWN_COL0..). Set it best-effort so
            # events written after this anchor map to real column names. Requires
            # SYSTEM_VARIABLES_ADMIN; if denied, set it server-side (a documented
            # prerequisite alongside log_bin=ON / binlog_format=ROW).
            try:
                cur.execute("SET GLOBAL binlog_row_metadata = FULL")
            except Exception:  # noqa: BLE001
                pass
            # Fail closed: anchoring here means every later cycle resumes from this
            # coordinate, so the row-image guarantees must already hold *now*.
            # (binlog_row_image can't be fixed by a SET that only takes effect for
            # new sessions, so verify rather than assume the best-effort SET stuck.)
            _require_full_row_image(self._read_row_image_vars(cur))
            for q in ("SHOW BINARY LOG STATUS", "SHOW MASTER STATUS"):  # 8.4 renamed it
                try:
                    cur.execute(q)
                    row = cur.fetchone()
                    if row:
                        return "{}:{}".format(row[0], row[1])
                except Exception:  # noqa: BLE001
                    continue
            # NOTE(review): an empty position makes capture() anchor again on the
            # next call, so a server that never reports a coordinate would
            # never be captured from; confirm the caller surfaces this.
            return ""
        finally:
            c.close()

    def capture(self, position: str) -> Tuple[List[ChangeRecord], str]:
        """Read row events after ``position`` and return ``(records, new_position)``.

        With an empty ``position`` nothing is read: the current server
        coordinate is returned as the anchor together with an empty list.

        Otherwise the stream is read non-blocking from ``position`` and each
        row becomes one record: INSERT for write events, UPDATE for update
        events (keyed by the after-image), DELETE for delete events. When an
        UPDATE changes the primary key, a DELETE for the before-image key is
        emitted immediately ahead of it; without it the target would keep the
        row under the old key, because the upsert only touches the new key.

        Raises ``Any2HeliosError`` when a row image carries placeholder column
        names or omits required columns.
        """
        if not position:
            # First cycle: anchor at the current position, capture nothing yet.
            return [], self.current_position()

        from pymysqlreplication import BinLogStreamReader
        from pymysqlreplication.row_event import (
            DeleteRowsEvent,
            UpdateRowsEvent,
            WriteRowsEvent,
        )

        # Split on the last ':' only, leaving any earlier ':' in the file part.
        log_file, _, log_pos = position.rpartition(":")
        stream = BinLogStreamReader(
            connection_settings=self._conn_settings(), server_id=self.server_id,
            only_schemas=[self.schema], only_tables=self._tables,
            only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
            log_file=log_file, log_pos=int(log_pos), resume_stream=True, blocking=False)
        records: List[ChangeRecord] = []
        try:
            for ev in stream:
                tbl = ev.table
                pk = self._pk.get(tbl, [])
                expected = self._cols.get(tbl, [])
                for row in ev.rows:
                    if isinstance(ev, WriteRowsEvent):
                        vals = row["values"]
                        _check_image_columns(tbl, "INSERT", vals.keys(), expected)
                        records.append(ChangeRecord(op=INSERT, schema=self.schema, table=tbl,
                                                    key={k: vals.get(k) for k in pk}, after=dict(vals)))
                    elif isinstance(ev, UpdateRowsEvent):
                        vals = row["after_values"]
                        _check_image_columns(tbl, "UPDATE", vals.keys(), expected)
                        # A key-changing UPDATE must retire the old key first.
                        stale = self._stale_key(row.get("before_values") or {}, vals, pk)
                        if stale:
                            records.append(ChangeRecord(op=DELETE, schema=self.schema, table=tbl,
                                                        key=stale, after={}))
                        records.append(ChangeRecord(op=UPDATE, schema=self.schema, table=tbl,
                                                    key={k: vals.get(k) for k in pk}, after=dict(vals)))
                    elif isinstance(ev, DeleteRowsEvent):
                        vals = row["values"]
                        # Delete only needs a sound key, but UNKNOWN_COL / missing PK
                        # still signals a non-FULL image, so verify the key columns.
                        _check_image_columns(tbl, "DELETE", vals.keys(), pk)
                        records.append(ChangeRecord(op=DELETE, schema=self.schema, table=tbl,
                                                    key={k: vals.get(k) for k in pk}, after={}))
            new_pos = "{}:{}".format(stream.log_file, stream.log_pos)
        finally:
            stream.close()
        return records, new_pos
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""v1 capture: Oracle SCN-watermark.
|
|
2
|
+
|
|
3
|
+
Captures rows whose ``ORA_ROWSCN`` exceeds the extract's watermark (a full
|
|
4
|
+
snapshot on the first cycle, when the watermark is 0) and emits them as upsert
|
|
5
|
+
change records. ``ORA_ROWSCN`` is block-granular without ``ROWDEPENDENCIES``, so
|
|
6
|
+
this may re-emit unchanged neighbours — harmless because the sink upserts on the
|
|
7
|
+
key. Deletes are not visible to a watermark scan; they are a log-based (v2)
|
|
8
|
+
concern. This is the guaranteed-portable Oracle "CDC" for shops without
|
|
9
|
+
LogMiner/supplemental-logging access.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from typing import List, Tuple
|
|
14
|
+
|
|
15
|
+
from ..registry import Extract # noqa: F401 (type reference for callers)
|
|
16
|
+
from ...core.change_record import UPDATE, ChangeRecord
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class OracleScnSource:
    """Watermark-based capture: emit rows whose ``ORA_ROWSCN`` passed a watermark.

    ``adapter`` must provide ``current_scn()`` and
    ``stream_rows(table, cols, where=...)``; ``schema`` is stamped on every
    emitted record; ``tables`` is the iterable of table definitions to scan.
    """

    def __init__(self, adapter, schema, tables) -> None:  # type: ignore[no-untyped-def]
        self.adapter = adapter
        self.schema = schema
        self.tables = tables

    def capture(self, since_scn: int) -> Tuple[List[ChangeRecord], int, List[str]]:
        """Scan every keyed table and return ``(records, new_watermark, skipped)``.

        Each streamed row becomes an UPDATE record carrying the full row as its
        after-image and the SCN read at the start of this call. Tables without
        primary-key columns are not scanned; their names are returned in
        ``skipped``. When ``since_scn`` is zero or negative no filter is
        applied, otherwise rows are filtered with ``ORA_ROWSCN > since_scn``.

        ``new_watermark`` is the SCN read at the start when it is positive,
        else ``since_scn`` is handed back unchanged.
        """
        # Read the watermark before scanning so commits that land during the
        # scan are picked up by the next cycle instead of being skipped.
        anchor_scn = self.adapter.current_scn()
        captured: List[ChangeRecord] = []
        without_pk: List[str] = []

        for table in self.tables:
            pk = table.primary_key
            if not (pk and pk.columns):
                without_pk.append(table.name)
                continue

            names = [column.name for column in table.columns]
            if since_scn <= 0:
                predicate = None
            else:
                predicate = "ORA_ROWSCN > {}".format(int(since_scn))

            for row in self.adapter.stream_rows(table, names, where=predicate):
                image = {}
                for idx, name in enumerate(names):
                    image[name] = row[idx]
                row_key = {key_col: image[key_col] for key_col in pk.columns}
                captured.append(ChangeRecord(op=UPDATE, schema=self.schema, table=table.name,
                                             key=row_key, after=image, scn=anchor_scn))

        if anchor_scn > 0:
            return captured, anchor_scn, without_pk
        return captured, since_scn, without_pk
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Durable append-only change trail (one file per extract).
|
|
2
|
+
|
|
3
|
+
Records are appended as JSON lines and fsync'd before the append returns, so a
|
|
4
|
+
committed record survives a crash. The reader is a simple line cursor: reading
|
|
5
|
+
from cursor N returns every record after line N and the new line count, which
|
|
6
|
+
the replicat persists only *after* a successful apply (at-least-once; combined
|
|
7
|
+
with idempotent upserts on the key, effectively-once per row).
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
from typing import List, Tuple
|
|
13
|
+
|
|
14
|
+
from ..core.change_record import ChangeRecord
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Trail:
    """Append-only JSON-lines file of change records, read by line cursor.

    The file is ``trail.jsonl`` inside ``trail_dir``; the directory is created
    on construction if it does not exist.
    """

    def __init__(self, trail_dir: str) -> None:
        os.makedirs(trail_dir, exist_ok=True)
        self.path = os.path.join(trail_dir, "trail.jsonl")

    def append(self, records: List[ChangeRecord]) -> int:
        """Write each record's ``to_json()`` as one line and return the count written.

        The file is flushed and fsync'd before returning. An empty ``records``
        returns 0 without touching the file.
        """
        if not records:
            return 0
        with open(self.path, "a", encoding="utf-8") as fh:
            fh.writelines(rec.to_json() + "\n" for rec in records)
            fh.flush()
            os.fsync(fh.fileno())
        return len(records)

    def line_count(self) -> int:
        """Return the number of lines in the trail file, or 0 if it does not exist."""
        if not os.path.exists(self.path):
            return 0
        total = 0
        with open(self.path, "r", encoding="utf-8") as fh:
            for _ in fh:
                total += 1
        return total

    def read(self, cursor: int) -> Tuple[List[ChangeRecord], int]:
        """Return the records on lines after ``cursor`` plus the new cursor.

        Lines are numbered from 1. Blank lines are counted but produce no
        record. The new cursor is the file's line count, unless ``cursor`` is
        already larger, in which case ``cursor`` is returned unchanged. A
        missing file yields ``([], cursor)``.
        """
        if not os.path.exists(self.path):
            return [], cursor
        parsed: List[ChangeRecord] = []
        seen = 0
        with open(self.path, "r", encoding="utf-8") as fh:
            for seen, raw in enumerate(fh, start=1):
                if seen > cursor:
                    text = raw.strip()
                    if text:
                        parsed.append(ChangeRecord.from_json(text))
        new_cursor = seen if seen > cursor else cursor
        return parsed, new_cursor
|
|
File without changes
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Split a table into key-range chunks for parallel + resumable load.
|
|
2
|
+
|
|
3
|
+
A chunk is a half-open integer-PK range ``[lo, hi)``. Chunks are deterministic
|
|
4
|
+
for a given source state (derived from ``MIN``/``MAX`` of the PK), so a resumed
|
|
5
|
+
run regenerates the identical ``chunk_id``s and can skip the ones the manifest
|
|
6
|
+
already recorded as loaded. Tables with no single integer PK fall back to one
|
|
7
|
+
whole-table chunk.
|
|
8
|
+
|
|
9
|
+
The same range is rendered for the source (Oracle, quoted/uppercase columns) and
|
|
10
|
+
the target (lowercased unless ``preserve_case``), so the loader can stream the
|
|
11
|
+
chunk from the source and idempotently DELETE the same range on the target.
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from typing import List, Optional
|
|
17
|
+
|
|
18
|
+
from ..core.catalog_model import Table
|
|
19
|
+
from ..core.identifiers import render_ident
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class Chunk:
    """One unit of load work: a whole table, or a half-open PK range of it.

    When ``pk_col`` is ``None`` the chunk covers the whole table and both
    ``*_where`` methods return ``None``. Otherwise the chunk covers rows with
    ``lo <= pk_col < hi``.
    """

    table: Table
    chunk_id: str
    pk_col: Optional[str] = None  # None => whole-table chunk
    lo: Optional[int] = None  # inclusive
    hi: Optional[int] = None  # exclusive

    def source_where(self) -> Optional[str]:
        """Return the range predicate for the source, with the column double-quoted as-is."""
        column = self.pk_col
        if column is not None:
            return '"{c}" >= {lo} AND "{c}" < {hi}'.format(c=column, lo=self.lo, hi=self.hi)
        return None

    def target_where(self, preserve_case: bool = False) -> Optional[str]:
        """Return the same range predicate for the target.

        The column goes through ``render_ident`` so that a reserved or
        mixed-case key column (e.g. "order", "User") is spelled the way the
        DDL/loader created it rather than as a bare token.
        """
        if self.pk_col is not None:
            rendered = render_ident(self.pk_col, preserve_case)
            return "{c} >= {lo} AND {c} < {hi}".format(c=rendered, lo=self.lo, hi=self.hi)
        return None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def compute_chunks(source, table: Table, target_chunks: int = 4) -> List[Chunk]:  # type: ignore[no-untyped-def]
    """Split *table* into at most *target_chunks* contiguous key ranges.

    Range chunking is attempted only when the table has a single-column
    primary key, *target_chunks* is greater than 1, and
    ``source.numeric_pk_bounds(table, col)`` returns a ``(lo, hi)`` pair
    rather than ``None``. In every other case the result is a single
    whole-table chunk with id ``"<table>:0"``.

    Ranges are half-open ``[start, end)``, equal in width except possibly the
    last, and together cover ``lo`` through ``hi`` inclusive. Chunk ids are
    ``"<table>:<ordinal>"`` with ordinals counting up from 0.
    """
    pk = table.primary_key
    bounds = None
    col = None
    if pk and len(pk.columns) == 1 and target_chunks > 1:
        col = pk.columns[0]
        bounds = source.numeric_pk_bounds(table, col)

    if bounds is None:
        return [Chunk(table, "{}:0".format(table.name))]

    lo, hi = bounds
    total = hi - lo + 1
    pieces = max(1, min(target_chunks, total))
    width = (total + pieces - 1) // pieces  # ceiling division

    out: List[Chunk] = []
    cursor = lo
    while cursor <= hi:
        # Exclusive upper bound; clamped so the final range ends just past hi.
        upper = min(cursor + width, hi + 1)
        out.append(Chunk(table, "{}:{}".format(table.name, len(out)), col, cursor, upper))
        cursor = upper
    return out
|