any2heliosdb 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. any2heliosdb/__init__.py +16 -0
  2. any2heliosdb/__main__.py +7 -0
  3. any2heliosdb/assess/__init__.py +28 -0
  4. any2heliosdb/assess/inventory.py +78 -0
  5. any2heliosdb/assess/render.py +161 -0
  6. any2heliosdb/assess/report.py +125 -0
  7. any2heliosdb/cdc/__init__.py +0 -0
  8. any2heliosdb/cdc/engine.py +143 -0
  9. any2heliosdb/cdc/registry.py +76 -0
  10. any2heliosdb/cdc/replicat.py +109 -0
  11. any2heliosdb/cdc/sinks/__init__.py +0 -0
  12. any2heliosdb/cdc/sources/__init__.py +0 -0
  13. any2heliosdb/cdc/sources/mysql_binlog.py +193 -0
  14. any2heliosdb/cdc/sources/oracle_scn.py +43 -0
  15. any2heliosdb/cdc/trail.py +51 -0
  16. any2heliosdb/chunking/__init__.py +0 -0
  17. any2heliosdb/chunking/pk_range.py +65 -0
  18. any2heliosdb/cli.py +489 -0
  19. any2heliosdb/config/__init__.py +0 -0
  20. any2heliosdb/config/model.py +85 -0
  21. any2heliosdb/config/store.py +146 -0
  22. any2heliosdb/config/wizard.py +119 -0
  23. any2heliosdb/constants.py +148 -0
  24. any2heliosdb/core/__init__.py +0 -0
  25. any2heliosdb/core/catalog_model.py +367 -0
  26. any2heliosdb/core/change_record.py +86 -0
  27. any2heliosdb/core/identifiers.py +80 -0
  28. any2heliosdb/core/loader.py +172 -0
  29. any2heliosdb/core/manifest.py +304 -0
  30. any2heliosdb/core/orchestrator.py +333 -0
  31. any2heliosdb/emit/__init__.py +0 -0
  32. any2heliosdb/emit/ddl.py +137 -0
  33. any2heliosdb/emit/mysql_ddl.py +195 -0
  34. any2heliosdb/emit/oracle_ddl.py +80 -0
  35. any2heliosdb/errors.py +51 -0
  36. any2heliosdb/geom/__init__.py +0 -0
  37. any2heliosdb/mcp/__init__.py +45 -0
  38. any2heliosdb/mcp/auth.py +179 -0
  39. any2heliosdb/mcp/protocol.py +141 -0
  40. any2heliosdb/mcp/server.py +222 -0
  41. any2heliosdb/mcp/tools.py +554 -0
  42. any2heliosdb/monitor/__init__.py +12 -0
  43. any2heliosdb/monitor/live.py +240 -0
  44. any2heliosdb/plsql/__init__.py +19 -0
  45. any2heliosdb/plsql/cost.py +47 -0
  46. any2heliosdb/plsql/gap.py +125 -0
  47. any2heliosdb/plsql/rewrite.py +351 -0
  48. any2heliosdb/sources/__init__.py +0 -0
  49. any2heliosdb/sources/base.py +81 -0
  50. any2heliosdb/sources/mssql/__init__.py +0 -0
  51. any2heliosdb/sources/mssql/adapter.py +429 -0
  52. any2heliosdb/sources/mysql/__init__.py +0 -0
  53. any2heliosdb/sources/mysql/adapter.py +237 -0
  54. any2heliosdb/sources/oracle/__init__.py +0 -0
  55. any2heliosdb/sources/oracle/adapter.py +309 -0
  56. any2heliosdb/sources/postgres/__init__.py +0 -0
  57. any2heliosdb/sources/postgres/adapter.py +608 -0
  58. any2heliosdb/target/__init__.py +0 -0
  59. any2heliosdb/target/base.py +196 -0
  60. any2heliosdb/target/capability.py +178 -0
  61. any2heliosdb/target/copy_codec.py +88 -0
  62. any2heliosdb/target/mysql_driver.py +239 -0
  63. any2heliosdb/target/native_driver.py +205 -0
  64. any2heliosdb/target/psycopg_driver.py +288 -0
  65. any2heliosdb/typemap/__init__.py +0 -0
  66. any2heliosdb/typemap/defaults.py +251 -0
  67. any2heliosdb/typemap/registry.py +83 -0
  68. any2heliosdb/validate/__init__.py +17 -0
  69. any2heliosdb/validate/counts.py +56 -0
  70. any2heliosdb/validate/data.py +244 -0
  71. any2heliosdb/validate/model.py +60 -0
  72. any2heliosdb/validate/structure.py +52 -0
  73. any2heliosdb-0.9.1.dist-info/METADATA +319 -0
  74. any2heliosdb-0.9.1.dist-info/RECORD +77 -0
  75. any2heliosdb-0.9.1.dist-info/WHEEL +4 -0
  76. any2heliosdb-0.9.1.dist-info/entry_points.txt +3 -0
  77. any2heliosdb-0.9.1.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,109 @@
1
+ """Replicat: apply captured change records to the target, idempotently.
2
+
3
+ Records are bucketed by table and applied through the target driver's upsert
4
+ (insert/update) and delete-by-key seams, so re-applying the same trail slice is
5
+ a no-op on row state. Tables without a primary key can't be keyed and are
6
+ skipped with a warning.
7
+
8
+ Within a table the trail is an *ordered* stream of explicit I/U/D change
9
+ records, so order is load-bearing: ``DELETE id=7`` then ``INSERT id=7`` must
10
+ leave the row present. The apply therefore walks each table's records in
11
+ arrival order, batching only maximal contiguous runs of the same op class
12
+ (upsert vs delete) so the relative delete/upsert order is preserved while still
13
+ amortizing the round-trips. (This is distinct from :meth:`reconcile_deletes`,
14
+ the snapshot-source key-set diff that has no per-record order to honor.)
15
+ """
16
+ from __future__ import annotations
17
+
18
+ from typing import Dict, Iterator, List, Tuple
19
+
20
+ from ..core.change_record import DELETE, ChangeRecord
21
+
22
+
23
+ def _runs_by_op_class(recs: List[ChangeRecord]) -> Iterator[Tuple[bool, List[ChangeRecord]]]:
24
+ """Split records into maximal contiguous runs of one op class, in order.
25
+
26
+ Yields ``(is_delete, run)``: ``is_delete`` True for a run of DELETEs, False
27
+ for a run of INSERT/UPDATE upserts. Splitting only at the upsert<->delete
28
+ boundary preserves the relative ordering that distinguishes ``D`` then ``I``
29
+ (row ends present) from ``I`` then ``D`` (row ends absent), while letting
30
+ same-class neighbors batch into one driver round-trip.
31
+ """
32
+ run: List[ChangeRecord] = []
33
+ cur_is_delete = False
34
+ for r in recs:
35
+ is_delete = r.op == DELETE
36
+ if run and is_delete != cur_is_delete:
37
+ yield cur_is_delete, run
38
+ run = []
39
+ cur_is_delete = is_delete
40
+ run.append(r)
41
+ if run:
42
+ yield cur_is_delete, run
43
+
44
+
45
+ class Replicat:
46
+ def __init__(self, target, schema_ir, preserve_case: bool = False) -> None: # type: ignore[no-untyped-def]
47
+ self.target = target
48
+ self.preserve_case = preserve_case
49
+ self._by_name = {t.name.upper(): t for t in schema_ir.tables}
50
+
51
+ def _ident(self, name: str) -> str:
52
+ return name if self.preserve_case else name.lower()
53
+
54
+ def apply(self, records: List[ChangeRecord]) -> Tuple[int, List[str]]:
55
+ buckets: Dict[str, List[ChangeRecord]] = {}
56
+ for r in records:
57
+ buckets.setdefault(r.table, []).append(r)
58
+
59
+ applied = 0
60
+ warnings: List[str] = []
61
+ for tname, recs in buckets.items():
62
+ t = self._by_name.get(tname.upper())
63
+ if t is None or not (t.primary_key and t.primary_key.columns):
64
+ warnings.append("{}: no primary key; skipped {} change(s)".format(tname, len(recs)))
65
+ continue
66
+ target_table = t.target_name(self.preserve_case)
67
+ cols = [c.name for c in t.columns]
68
+ tcols = [self._ident(c) for c in cols]
69
+ key_cols = [self._ident(c) for c in t.primary_key.columns]
70
+ pk = t.primary_key.columns
71
+
72
+ # Apply in arrival order, flushing one op class at a time so that an
73
+ # earlier DELETE is never resurrected by a later batched upsert (and
74
+ # vice versa). Contiguous same-class records are coalesced into a
75
+ # single driver call to keep the round-trip count low.
76
+ for is_delete, run in _runs_by_op_class(recs):
77
+ if is_delete:
78
+ keys = [[r.key.get(c) for c in pk] for r in run]
79
+ applied += self.target.delete_keys(target_table, key_cols, keys)
80
+ else:
81
+ rows = [[r.after.get(c) for c in cols] for r in run]
82
+ applied += self.target.upsert(target_table, key_cols, tcols, rows)
83
+ return applied, warnings
84
+
85
+ def reconcile_deletes(self, source_adapter) -> Tuple[int, List[str]]: # type: ignore[no-untyped-def]
86
+ """Delete target rows whose PK is absent from the source's current keys.
87
+
88
+ v1 SCN-watermark capture cannot observe DELETEs (the rows are already
89
+ gone), so the replicat reconciles them with a full key-set diff: keys on
90
+ the target but not in the source are removed. This is a full pass
91
+ (cost O(keys)); incremental delete capture is the log-based roadmap.
92
+ """
93
+ deleted = 0
94
+ warnings: List[str] = []
95
+ for t in self._by_name.values():
96
+ if not (t.primary_key and t.primary_key.columns):
97
+ continue
98
+ pk = t.primary_key.columns
99
+ try:
100
+ src_keys = {tuple(r) for r in source_adapter.stream_rows(t, pk)}
101
+ target_table = t.target_name(self.preserve_case)
102
+ key_idents = [self._ident(c) for c in pk]
103
+ tgt_keys = {tuple(r) for r in self.target.select_keys(target_table, key_idents)}
104
+ extra = [list(k) for k in (tgt_keys - src_keys)]
105
+ if extra:
106
+ deleted += self.target.delete_keys(target_table, key_idents, extra)
107
+ except Exception as e: # noqa: BLE001
108
+ warnings.append("delete reconcile {}: {}".format(t.name, e))
109
+ return deleted, warnings
File without changes
File without changes
@@ -0,0 +1,193 @@
1
+ """MySQL binlog CDC capture (log-based).
2
+
3
+ Reads ROW-format binlog events (`mysql-replication`) and turns them into
4
+ `ChangeRecord`s — real inserts/updates **and deletes**, unlike the SCN-watermark
5
+ source. The capture cursor is the binlog coordinate ``"<file>:<pos>"``; on the
6
+ first cycle it anchors at the server's *current* position (so only changes after
7
+ the baseline load are captured) and returns no records.
8
+
9
+ Requires the source MySQL to have ``log_bin=ON`` and ``binlog_format=ROW``, and
10
+ the connecting user to hold ``REPLICATION SLAVE``/``REPLICATION CLIENT``.
11
+
12
+ Correctness depends on **full** row images and column metadata: with
13
+ ``binlog_row_image=MINIMAL`` an UPDATE only logs the changed columns (the
14
+ replicat would then NULL the omitted ones), and with
15
+ ``binlog_row_metadata=MINIMAL`` events carry ``UNKNOWN_COL0..`` placeholders
16
+ instead of real column names. The source therefore fails closed — it verifies
17
+ ``binlog_format=ROW``, ``binlog_row_metadata=FULL`` and ``binlog_row_image=FULL``
18
+ before anchoring/capture, and rejects any captured event whose after-image does
19
+ not cover the table's columns.
20
+ """
21
+ from __future__ import annotations
22
+
23
+ from typing import Dict, List, Tuple
24
+
25
+ from ...core.change_record import DELETE, INSERT, UPDATE, ChangeRecord
26
+ from ...errors import Any2HeliosError
27
+
28
+ # Server variables that must be FULL/ROW for log-based CDC to be lossless.
29
+ _REQUIRED_VARS = (
30
+ ("binlog_format", "ROW",
31
+ "binlog row events are required; STATEMENT/MIXED don't carry per-row before/after images"),
32
+ ("binlog_row_metadata", "FULL",
33
+ "MINIMAL omits column names, so events expose UNKNOWN_COL0.. instead of real columns"),
34
+ ("binlog_row_image", "FULL",
35
+ "MINIMAL logs only changed columns on UPDATE, so unchanged columns would be written as NULL"),
36
+ )
37
+
38
+
39
+ def _require_full_row_image(settings: Dict[str, str]) -> None:
40
+ """Raise unless ROW binlog with FULL metadata + FULL row image is in effect.
41
+
42
+ ``settings`` maps server-variable name -> value (case-insensitively), as read
43
+ from ``SHOW VARIABLES``. Fails closed with a clear, actionable message naming
44
+ the offending variable rather than letting capture silently corrupt the
45
+ target. Pure (no I/O) so it can be unit-tested with plain dicts.
46
+ """
47
+ norm = {str(k).lower(): ("" if v is None else str(v)) for k, v in settings.items()}
48
+ for var, want, why in _REQUIRED_VARS:
49
+ got = norm.get(var.lower())
50
+ if got is None:
51
+ raise Any2HeliosError(
52
+ "MySQL CDC: could not read '{}' (need {}={}). {}. Grant the connecting "
53
+ "user access to read server variables, or set it server-side.".format(
54
+ var, var, want, why))
55
+ if got.strip().upper() != want:
56
+ raise Any2HeliosError(
57
+ "MySQL CDC requires {}={} but the server reports {}={!r}. {}. Fix it with "
58
+ "`SET GLOBAL {}={}` (needs SYSTEM_VARIABLES_ADMIN; for binlog_format also "
59
+ "restart replication threads) or set it in my.cnf and restart.".format(
60
+ var, want, var, got, why, var, want))
61
+
62
+
63
+ def _check_image_columns(table: str, op: str, image_keys, expected_cols) -> None: # type: ignore[no-untyped-def]
64
+ """Raise if a captured row image is missing columns or carries placeholders.
65
+
66
+ A FULL row image/metadata stream names every column; a MINIMAL one drops
67
+ unchanged columns and/or surfaces ``UNKNOWN_COL0..`` keys. Either case would
68
+ have the replicat write NULLs over real data, so reject the event loudly.
69
+ ``image_keys`` is the dict-key set of the captured value map; ``expected_cols``
70
+ is the table's full column list from the source schema.
71
+ """
72
+ have = set(image_keys)
73
+ bad = sorted(k for k in have if str(k).upper().startswith("UNKNOWN_COL"))
74
+ if bad:
75
+ raise Any2HeliosError(
76
+ "MySQL CDC: {} on {} carried unnamed columns {} — binlog_row_metadata is not "
77
+ "FULL. Set binlog_row_metadata=FULL on the source and re-anchor.".format(
78
+ op, table, bad))
79
+ missing = [c for c in expected_cols if c not in have]
80
+ if missing:
81
+ raise Any2HeliosError(
82
+ "MySQL CDC: {} on {} omitted columns {} from its row image — binlog_row_image "
83
+ "is not FULL (partial images would be written as NULL). Set "
84
+ "binlog_row_image=FULL on the source and re-anchor.".format(op, table, missing))
85
+
86
+
87
class MySqlBinlogSource:
    """Log-based CDC source that turns MySQL ROW binlog events into change records.

    ``dsn`` supplies ``host``, ``port``, ``user`` and ``password``; ``tables``
    supplies the table objects whose names, column names and primary-key
    columns are indexed at construction time.
    """

    def __init__(self, dsn, schema, tables, server_id: int = 4279):  # type: ignore[no-untyped-def]
        self.dsn = dsn
        self.schema = schema
        self.server_id = server_id
        # Per-table lookups keyed by table name as it appears in the schema.
        self._pk = {t.name: (list(t.primary_key.columns) if t.primary_key else []) for t in tables}
        self._cols = {t.name: [c.name for c in t.columns] for t in tables}
        self._tables = [t.name for t in tables]

    def _conn_settings(self) -> dict:
        """Return the connection settings dict handed to the binlog stream reader."""
        return {"host": self.dsn.host, "port": int(self.dsn.port),
                "user": self.dsn.user, "passwd": self.dsn.password or ""}

    @staticmethod
    def _read_row_image_vars(cur) -> Dict[str, str]:  # type: ignore[no-untyped-def]
        """Read the binlog_* variables that gate lossless capture into a dict.

        A variable whose query fails or returns no row is simply left out of the
        result, which makes the subsequent validation report it as unreadable.
        """
        out: Dict[str, str] = {}
        for var, _want, _why in _REQUIRED_VARS:
            try:
                cur.execute("SHOW VARIABLES LIKE %s", (var,))
                row = cur.fetchone()
            except Exception:  # noqa: BLE001
                row = None
            if row:
                # SHOW VARIABLES returns (Variable_name, Value).
                out[str(row[0])] = "" if row[1] is None else str(row[1])
        return out

    @staticmethod
    def _superseded_key(before, after, pk):  # type: ignore[no-untyped-def]
        """Return the pre-update key if an UPDATE changed the primary key, else None.

        ``before``/``after`` are the before- and after-image value maps and
        ``pk`` the primary-key column names. None is returned when there is no
        primary key, no usable before-image, or the key is unchanged.
        """
        if not pk or not before:
            return None
        if any(c not in before for c in pk):
            return None
        old_key = {c: before[c] for c in pk}
        new_key = {c: after.get(c) for c in pk}
        return old_key if old_key != new_key else None

    def current_position(self) -> str:
        """Validate the server's binlog settings and return ``"<file>:<pos>"``.

        Returns an empty string when neither status statement yields a row.

        Raises:
            Any2HeliosError: if the required binlog variables are unreadable or
                not set to the required values.
        """
        import pymysql

        c = pymysql.connect(**{k: v for k, v in self._conn_settings().items() if k != "passwd"},
                            password=self.dsn.password or "")
        try:
            cur = c.cursor()
            # Binlog row events only carry column *names* when row metadata is FULL
            # (the default MINIMAL yields UNKNOWN_COL0..). Set it best-effort so
            # events written after this anchor map to real column names. Requires
            # SYSTEM_VARIABLES_ADMIN; if denied, set it server-side (a documented
            # prerequisite alongside log_bin=ON / binlog_format=ROW).
            try:
                cur.execute("SET GLOBAL binlog_row_metadata = FULL")
            except Exception:  # noqa: BLE001
                pass
            # Fail closed: anchoring here means every later cycle resumes from this
            # coordinate, so the row-image guarantees must already hold *now*.
            # (binlog_row_image can't be fixed by a SET that only takes effect for
            # new sessions, so verify rather than assume the best-effort SET stuck.)
            _require_full_row_image(self._read_row_image_vars(cur))
            for q in ("SHOW BINARY LOG STATUS", "SHOW MASTER STATUS"):  # 8.4 renamed it
                try:
                    cur.execute(q)
                    row = cur.fetchone()
                    if row:
                        return "{}:{}".format(row[0], row[1])
                except Exception:  # noqa: BLE001
                    continue
            return ""
        finally:
            c.close()

    def capture(self, position: str) -> Tuple[List[ChangeRecord], str]:
        """Read row events after *position* and return ``(records, new_position)``.

        With an empty *position* nothing is captured; the current server
        position is returned as the anchor. Otherwise the binlog is read
        non-blockingly from ``"<file>:<pos>"`` and each row becomes one change
        record — except an UPDATE that changes the primary key, which becomes a
        DELETE of the old key followed by the UPDATE under the new key, so the
        target does not keep a stale row under the old key.

        Raises:
            Any2HeliosError: if a captured row image has placeholder or
                missing columns.
        """
        if not position:
            # First cycle: anchor at the current position, capture nothing yet.
            return [], self.current_position()

        from pymysqlreplication import BinLogStreamReader
        from pymysqlreplication.row_event import (
            DeleteRowsEvent,
            UpdateRowsEvent,
            WriteRowsEvent,
        )

        # rpartition: the position is split at its last colon.
        log_file, _, log_pos = position.rpartition(":")
        stream = BinLogStreamReader(
            connection_settings=self._conn_settings(), server_id=self.server_id,
            only_schemas=[self.schema], only_tables=self._tables,
            only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
            log_file=log_file, log_pos=int(log_pos), resume_stream=True, blocking=False)
        records: List[ChangeRecord] = []
        try:
            for ev in stream:
                tbl = ev.table
                pk = self._pk.get(tbl, [])
                expected = self._cols.get(tbl, [])
                for row in ev.rows:
                    if isinstance(ev, WriteRowsEvent):
                        vals = row["values"]
                        _check_image_columns(tbl, "INSERT", vals.keys(), expected)
                        records.append(ChangeRecord(op=INSERT, schema=self.schema, table=tbl,
                                                    key={k: vals.get(k) for k in pk}, after=dict(vals)))
                    elif isinstance(ev, UpdateRowsEvent):
                        vals = row["after_values"]
                        _check_image_columns(tbl, "UPDATE", vals.keys(), expected)
                        # A key-changing UPDATE must first remove the row stored
                        # under the old key; the upsert below only writes the new one.
                        old_key = self._superseded_key(row.get("before_values"), vals, pk)
                        if old_key is not None:
                            records.append(ChangeRecord(op=DELETE, schema=self.schema, table=tbl,
                                                        key=old_key, after={}))
                        records.append(ChangeRecord(op=UPDATE, schema=self.schema, table=tbl,
                                                    key={k: vals.get(k) for k in pk}, after=dict(vals)))
                    elif isinstance(ev, DeleteRowsEvent):
                        vals = row["values"]
                        # Delete only needs a sound key, but UNKNOWN_COL / missing PK
                        # still signals a non-FULL image, so verify the key columns.
                        _check_image_columns(tbl, "DELETE", vals.keys(), pk)
                        records.append(ChangeRecord(op=DELETE, schema=self.schema, table=tbl,
                                                    key={k: vals.get(k) for k in pk}, after={}))
            new_pos = "{}:{}".format(stream.log_file, stream.log_pos)
        finally:
            stream.close()
        return records, new_pos
@@ -0,0 +1,43 @@
1
+ """v1 capture: Oracle SCN-watermark.
2
+
3
+ Captures rows whose ``ORA_ROWSCN`` exceeds the extract's watermark (a full
4
+ snapshot on the first cycle, when the watermark is 0) and emits them as upsert
5
+ change records. ``ORA_ROWSCN`` is block-granular without ``ROWDEPENDENCIES``, so
6
+ this may re-emit unchanged neighbours — harmless because the sink upserts on the
7
+ key. Deletes are not visible to a watermark scan; they are a log-based (v2)
8
+ concern. This is the guaranteed-portable Oracle "CDC" for shops without
9
+ LogMiner/supplemental-logging access.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ from typing import List, Tuple
14
+
15
+ from ..registry import Extract # noqa: F401 (type reference for callers)
16
+ from ...core.change_record import UPDATE, ChangeRecord
17
+
18
+
19
class OracleScnSource:
    """Watermark-based capture: emit rows whose ``ORA_ROWSCN`` passed the watermark.

    ``adapter`` is used through ``current_scn()`` and
    ``stream_rows(table, cols, where=...)``.
    """

    def __init__(self, adapter, schema, tables) -> None:  # type: ignore[no-untyped-def]
        self.adapter = adapter
        self.schema = schema
        self.tables = tables

    def capture(self, since_scn: int) -> Tuple[List[ChangeRecord], int, List[str]]:
        """Scan every keyed table and return ``(records, new_watermark, skipped)``.

        Every emitted record has op ``UPDATE`` and is stamped with the SCN read
        at the start of the call. ``skipped`` lists the names of tables without
        a primary key. When ``since_scn`` is zero or negative the scan is
        unfiltered. The returned watermark is the SCN read at the start, unless
        that value is not positive, in which case ``since_scn`` is kept.
        """
        # Read the SCN first: commits that land while scanning are then picked
        # up by the next cycle instead of being skipped.
        scn_at_start = self.adapter.current_scn()
        emitted: List[ChangeRecord] = []
        keyless: List[str] = []
        for table in self.tables:
            if table.primary_key and table.primary_key.columns:
                col_names = [column.name for column in table.columns]
                if since_scn <= 0:
                    predicate = None
                else:
                    predicate = "ORA_ROWSCN > {}".format(int(since_scn))
                for values in self.adapter.stream_rows(table, col_names, where=predicate):
                    image = {name: values[pos] for pos, name in enumerate(col_names)}
                    row_key = {name: image[name] for name in table.primary_key.columns}
                    emitted.append(ChangeRecord(op=UPDATE, schema=self.schema, table=table.name,
                                                key=row_key, after=image, scn=scn_at_start))
            else:
                keyless.append(table.name)
        if scn_at_start > 0:
            watermark = scn_at_start
        else:
            watermark = since_scn
        return emitted, watermark, keyless
@@ -0,0 +1,51 @@
1
+ """Durable append-only change trail (one file per extract).
2
+
3
+ Records are appended as JSON lines and fsync'd before the append returns, so a
4
+ committed record survives a crash. The reader is a simple line cursor: reading
5
+ from cursor N returns every record after line N and the new line count, which
6
+ the replicat persists only *after* a successful apply (at-least-once; combined
7
+ with idempotent upserts on the key, effectively-once per row).
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import os
12
+ from typing import List, Tuple
13
+
14
+ from ..core.change_record import ChangeRecord
15
+
16
+
17
class Trail:
    """Append-only JSON-lines file (``trail.jsonl``) holding one extract's change records."""

    def __init__(self, trail_dir: str) -> None:
        # Create the directory up front so later appends cannot fail on it.
        os.makedirs(trail_dir, exist_ok=True)
        self.path = os.path.join(trail_dir, "trail.jsonl")

    def append(self, records: List[ChangeRecord]) -> int:
        """Write *records* as one JSON line each and return how many were written.

        The file is flushed and fsync'd before returning. An empty *records*
        returns 0 without touching the file.
        """
        if not records:
            return 0
        with open(self.path, "a", encoding="utf-8") as fh:
            for rec in records:
                fh.write(rec.to_json() + "\n")
            fh.flush()
            os.fsync(fh.fileno())
        return len(records)

    def line_count(self) -> int:
        """Return the number of lines in the trail file, or 0 if it does not exist."""
        if not os.path.exists(self.path):
            return 0
        total = 0
        with open(self.path, "r", encoding="utf-8") as fh:
            for _ in fh:
                total += 1
        return total

    def read(self, cursor: int) -> Tuple[List[ChangeRecord], int]:
        """Return the records found after line *cursor*, plus the new cursor.

        The new cursor is the larger of *cursor* and the file's line count, so
        it never moves backwards. Lines that are blank after stripping are
        counted but produce no record. A missing file yields ``([], cursor)``.
        """
        if not os.path.exists(self.path):
            return [], cursor
        fresh: List[ChangeRecord] = []
        last_line_no = 0
        with open(self.path, "r", encoding="utf-8") as fh:
            for last_line_no, raw in enumerate(fh, start=1):
                if last_line_no > cursor:
                    text = raw.strip()
                    if text:
                        fresh.append(ChangeRecord.from_json(text))
        return fresh, max(cursor, last_line_no)
File without changes
@@ -0,0 +1,65 @@
1
+ """Split a table into key-range chunks for parallel + resumable load.
2
+
3
+ A chunk is a half-open integer-PK range ``[lo, hi)``. Chunks are deterministic
4
+ for a given source state (derived from ``MIN``/``MAX`` of the PK), so a resumed
5
+ run regenerates the identical ``chunk_id``s and can skip the ones the manifest
6
+ already recorded as loaded. Tables with no single integer PK fall back to one
7
+ whole-table chunk.
8
+
9
+ The same range is rendered for the source (Oracle, quoted/uppercase columns) and
10
+ the target (lowercased unless ``preserve_case``), so the loader can stream the
11
+ chunk from the source and idempotently DELETE the same range on the target.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ from dataclasses import dataclass
16
+ from typing import List, Optional
17
+
18
+ from ..core.catalog_model import Table
19
+ from ..core.identifiers import render_ident
20
+
21
+
22
@dataclass
class Chunk:
    """One unit of load work: a whole table, or a half-open PK range ``[lo, hi)`` of it."""

    table: Table
    chunk_id: str
    pk_col: Optional[str] = None  # None => whole-table chunk
    lo: Optional[int] = None  # inclusive
    hi: Optional[int] = None  # exclusive

    def source_where(self) -> Optional[str]:
        """Return the range predicate for the source, or None for a whole-table chunk.

        The PK column is wrapped in double quotes exactly as stored.
        """
        if self.pk_col is None:
            return None
        return '"{c}" >= {lo} AND "{c}" < {hi}'.format(c=self.pk_col, lo=self.lo, hi=self.hi)

    def target_where(self, preserve_case: bool = False) -> Optional[str]:
        """Return the same range predicate for the target, or None for a whole-table chunk."""
        if self.pk_col is None:
            return None
        # Render through the shared quoter so a reserved/mixed-case PK column
        # (e.g. "order", "User") in the idempotent range DELETE matches the name
        # the DDL/loader created, instead of a bare token that errors or folds.
        c = render_ident(self.pk_col, preserve_case)
        return "{c} >= {lo} AND {c} < {hi}".format(c=c, lo=self.lo, hi=self.hi)


def compute_chunks(source, table: Table, target_chunks: int = 4) -> List[Chunk]:  # type: ignore[no-untyped-def]
    """Return the chunk list for *table*, aiming for ~*target_chunks* pieces.

    Range chunks are produced only when the table has a single-column primary
    key, ``target_chunks`` is greater than 1 and
    ``source.numeric_pk_bounds(table, col)`` returns usable ``(lo, hi)``
    bounds. In every other case — including bounds that are None, contain
    None, or are inverted (``hi < lo``) — the result is one whole-table chunk
    with id ``"<table>:0"``, so a table is never left without any chunk.
    """
    pk = table.primary_key
    if pk and len(pk.columns) == 1 and target_chunks > 1:
        col = pk.columns[0]
        bounds = source.numeric_pk_bounds(table, col)
        if bounds is not None:
            lo, hi = bounds
            # Unusable bounds would otherwise raise on the arithmetic below or
            # yield an empty chunk list; fall through to the whole-table chunk.
            if lo is not None and hi is not None and lo <= hi:
                span = hi - lo + 1
                n = max(1, min(target_chunks, span))
                step = (span + n - 1) // n  # ceil
                chunks: List[Chunk] = []
                start = lo
                ordinal = 0
                while start <= hi:
                    end = min(start + step, hi + 1)  # exclusive; last covers hi
                    chunks.append(Chunk(table, "{}:{}".format(table.name, ordinal), col, start, end))
                    start = end
                    ordinal += 1
                return chunks
    return [Chunk(table, "{}:0".format(table.name))]