pgsync 6.0.0__tar.gz → 6.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {pgsync-6.0.0 → pgsync-6.1.0}/PKG-INFO +7 -7
  2. {pgsync-6.0.0 → pgsync-6.1.0}/README.md +32 -29
  3. {pgsync-6.0.0 → pgsync-6.1.0}/README.rst +1 -1
  4. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/__init__.py +1 -1
  5. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/base.py +18 -0
  6. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/helper.py +1 -1
  7. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/sync.py +286 -133
  8. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/utils.py +4 -1
  9. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync.egg-info/PKG-INFO +7 -7
  10. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync.egg-info/requires.txt +5 -5
  11. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_sync.py +8 -5
  12. {pgsync-6.0.0 → pgsync-6.1.0}/AUTHORS.rst +0 -0
  13. {pgsync-6.0.0 → pgsync-6.1.0}/CONTRIBUTING.rst +0 -0
  14. {pgsync-6.0.0 → pgsync-6.1.0}/HISTORY.rst +0 -0
  15. {pgsync-6.0.0 → pgsync-6.1.0}/LICENSE +0 -0
  16. {pgsync-6.0.0 → pgsync-6.1.0}/MANIFEST.in +0 -0
  17. {pgsync-6.0.0 → pgsync-6.1.0}/bin/bootstrap +0 -0
  18. {pgsync-6.0.0 → pgsync-6.1.0}/bin/parallel_sync +0 -0
  19. {pgsync-6.0.0 → pgsync-6.1.0}/bin/pgsync +0 -0
  20. {pgsync-6.0.0 → pgsync-6.1.0}/docs/Makefile +0 -0
  21. {pgsync-6.0.0 → pgsync-6.1.0}/docs/authors.rst +0 -0
  22. {pgsync-6.0.0 → pgsync-6.1.0}/docs/changelog.rst +0 -0
  23. {pgsync-6.0.0 → pgsync-6.1.0}/docs/conf.py +0 -0
  24. {pgsync-6.0.0 → pgsync-6.1.0}/docs/contributing.rst +0 -0
  25. {pgsync-6.0.0 → pgsync-6.1.0}/docs/history.rst +0 -0
  26. {pgsync-6.0.0 → pgsync-6.1.0}/docs/index.rst +0 -0
  27. {pgsync-6.0.0 → pgsync-6.1.0}/docs/installation.rst +0 -0
  28. {pgsync-6.0.0 → pgsync-6.1.0}/docs/logo.png +0 -0
  29. {pgsync-6.0.0 → pgsync-6.1.0}/docs/make.bat +0 -0
  30. {pgsync-6.0.0 → pgsync-6.1.0}/docs/readme.rst +0 -0
  31. {pgsync-6.0.0 → pgsync-6.1.0}/docs/usage.rst +0 -0
  32. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/constants.py +0 -0
  33. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/exc.py +0 -0
  34. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/node.py +0 -0
  35. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/plugin.py +0 -0
  36. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/querybuilder.py +0 -0
  37. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/redisqueue.py +0 -0
  38. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/search_client.py +0 -0
  39. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/settings.py +0 -0
  40. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/singleton.py +0 -0
  41. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/transform.py +0 -0
  42. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/trigger.py +0 -0
  43. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/urls.py +0 -0
  44. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync/view.py +0 -0
  45. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync.egg-info/SOURCES.txt +0 -0
  46. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync.egg-info/dependency_links.txt +0 -0
  47. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync.egg-info/not-zip-safe +0 -0
  48. {pgsync-6.0.0 → pgsync-6.1.0}/pgsync.egg-info/top_level.txt +0 -0
  49. {pgsync-6.0.0 → pgsync-6.1.0}/pyproject.toml +0 -0
  50. {pgsync-6.0.0 → pgsync-6.1.0}/setup.cfg +0 -0
  51. {pgsync-6.0.0 → pgsync-6.1.0}/setup.py +0 -0
  52. {pgsync-6.0.0 → pgsync-6.1.0}/tests/__init__.py +0 -0
  53. {pgsync-6.0.0 → pgsync-6.1.0}/tests/conftest.py +0 -0
  54. {pgsync-6.0.0 → pgsync-6.1.0}/tests/fixtures/schema.json +0 -0
  55. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_base.py +0 -0
  56. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_constants.py +0 -0
  57. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_env_vars.py +0 -0
  58. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_helper.py +0 -0
  59. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_log_handlers.py +0 -0
  60. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_node.py +0 -0
  61. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_query_builder.py +0 -0
  62. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_redisqueue.py +0 -0
  63. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_search_client.py +0 -0
  64. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_settings.py +0 -0
  65. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_sync_nested_children.py +0 -0
  66. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_sync_root.py +0 -0
  67. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_sync_single_child_fk_on_child.py +0 -0
  68. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_sync_single_child_fk_on_parent.py +0 -0
  69. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_transform.py +0 -0
  70. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_trigger.py +0 -0
  71. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_unique_behaviour.py +0 -0
  72. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_urls.py +0 -0
  73. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_utils.py +0 -0
  74. {pgsync-6.0.0 → pgsync-6.1.0}/tests/test_view.py +0 -0
  75. {pgsync-6.0.0 → pgsync-6.1.0}/tests/testing_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pgsync
3
- Version: 6.0.0
3
+ Version: 6.1.0
4
4
  Summary: Postgres/MySQL/MariaDB to Elasticsearch/OpenSearch sync
5
5
  Home-page: https://github.com/toluaina/pgsync
6
6
  Author: Tolu Aina
@@ -33,9 +33,9 @@ License-File: LICENSE
33
33
  License-File: AUTHORS.rst
34
34
  Requires-Dist: async-timeout==5.0.1
35
35
  Requires-Dist: backports-datetime-fromisoformat==2.0.3
36
- Requires-Dist: boto3==1.40.64
37
- Requires-Dist: botocore==1.40.64
38
- Requires-Dist: certifi==2025.10.5
36
+ Requires-Dist: boto3==1.41.2
37
+ Requires-Dist: botocore==1.41.2
38
+ Requires-Dist: certifi==2025.11.12
39
39
  Requires-Dist: charset-normalizer==3.4.4
40
40
  Requires-Dist: click==8.1.8
41
41
  Requires-Dist: elastic-transport==9.1.0
@@ -46,7 +46,7 @@ Requires-Dist: events==0.5
46
46
  Requires-Dist: idna==3.11
47
47
  Requires-Dist: jmespath==1.0.1
48
48
  Requires-Dist: marshmallow==4.0.1
49
- Requires-Dist: mysql-replication==1.0.9
49
+ Requires-Dist: mysql-replication==1.0.12
50
50
  Requires-Dist: opensearch-dsl==2.1.0
51
51
  Requires-Dist: opensearch-py==3.0.0
52
52
  Requires-Dist: packaging==25.0
@@ -57,7 +57,7 @@ Requires-Dist: python-dotenv==1.2.1
57
57
  Requires-Dist: redis==7.0.1
58
58
  Requires-Dist: requests==2.32.5
59
59
  Requires-Dist: requests-aws4auth==1.3.1
60
- Requires-Dist: s3transfer==0.14.0
60
+ Requires-Dist: s3transfer==0.15.0
61
61
  Requires-Dist: six==1.17.0
62
62
  Requires-Dist: sqlalchemy==2.0.44
63
63
  Requires-Dist: sqlparse==0.5.3
@@ -79,7 +79,7 @@ Dynamic: requires-dist
79
79
  Dynamic: requires-python
80
80
  Dynamic: summary
81
81
 
82
- # PostgreSQL to Elasticsearch/OpenSearch sync
82
+ # PostgreSQL/MySQL/MariaDB to Elasticsearch/OpenSearch sync
83
83
 
84
84
 
85
85
  - [PGSync](https://pgsync.com) is a middleware for syncing data from [Postgres](https://www.postgresql.org) to [Elasticsearch](https://www.elastic.co/products/elastic-stack) or [OpenSearch](https://opensearch.org/).
@@ -73,7 +73,7 @@ Of course, if your data never changed, then you could just take a snapshot in ti
73
73
  PGSync is appropriate for you if:
74
74
  - [Postgres](https://www.postgresql.org) or [MySQL](https://www.mysql.com/) or [MariaDB](https://mariadb.org/) is your read/write source of truth whilst [Elasticsearch](https://www.elastic.co/products/elastic-stack)/[OpenSearch](https://opensearch.org/) is your
75
75
  read-only search layer.
76
- - You need to denormalize relational data into a NoSQL data source.
76
+ - You need to denormalize relational data into a NoSQL data source like [Elasticsearch](https://www.elastic.co/products/elastic-stack)/[OpenSearch](https://opensearch.org/).
77
77
  - Your data is constantly changing.
78
78
  - You have existing data in a relational database such as [Postgres](https://www.postgresql.org) or [MySQL](https://www.mysql.com/) or [MariaDB](https://mariadb.org/) and you need a secondary NoSQL database like [Elasticsearch](https://www.elastic.co/products/elastic-stack)/[OpenSearch](https://opensearch.org/) for text-based queries or autocomplete queries to mirror the existing data without having your application perform dual writes.
79
79
  - You want to keep your existing data untouched whilst taking advantage of
@@ -99,7 +99,8 @@ PGSync operates in an event-driven model by creating triggers for tables in your
99
99
 
100
100
  *This is the only time PGSync will ever make any changes to your database.*
101
101
 
102
- **NOTE**: **If you change the structure of your PGSync's schema config, you would need to rebuild your Elasticsearch/OpenSearch indices.**
102
+ >**NOTE**: **If you change the structure of your PGSync schema config, it's recommended and in most cases necessary to rebuild your Elasticsearch/OpenSearch indices.**
103
+
103
104
  There are plans to support zero-downtime migrations to streamline this process.
104
105
 
105
106
 
@@ -187,7 +188,6 @@ Environment variable placeholders - full list [here](https://pgsync.com/env-vars
187
188
  ### MySQL / MariaDB setup
188
189
 
189
190
  - Enable binary logging in your MySQL / MariaDB setting.
190
-
191
191
  - You also need to set up the following parameters in your MySQL / MariaDB config my.cnf, then restart the database server.
192
192
 
193
193
  ```server-id = 1``` # any non-zero unique ID
@@ -195,10 +195,8 @@ Environment variable placeholders - full list [here](https://pgsync.com/env-vars
195
195
  ```log_bin = mysql-bin```
196
196
 
197
197
  ```binlog_row_image = FULL``` # recommended; if not supported on older MariaDB, omit
198
-
199
198
  - optional housekeeping:
200
199
  ```binlog_expire_logs_seconds = 604800``` # 7 days
201
-
202
200
  - You need to create a replication user with REPLICATION SLAVE and REPLICATION CLIENT privileges
203
201
 
204
202
  ```sql
@@ -224,17 +222,19 @@ Environment variable placeholders - full list [here](https://pgsync.com/env-vars
224
222
 
225
223
  Key features of PGSync are:
226
224
 
227
- - Easily denormalize relational data.
228
- - Works with any PostgreSQL database (version 9.6 or later).
229
- - Negligible impact on database performance.
230
- - Transactionally consistent output in Elasticsearch/OpenSearch. This means: writes appear only when they are committed to the database, insert, update and delete operations appear in the same order as they were committed (as opposed to eventual consistency).
231
- - Fault-tolerant: does not lose data, even if processes crash or a network interruption occurs, etc. The process can be recovered from the last checkpoint.
232
- - Returns the data directly as Postgres/MySQL/MariaDB JSON from the database for speed.
233
- - Supports composite primary and foreign keys.
234
- - Supports Views and Materialized views.
235
- - Supports an arbitrary depth of nested entities i.e Tables having long chain of relationship dependencies.
236
- - Supports PostgreSQL/MySQL/MariaDB JSON data fields. This means: we can extract JSON fields in a database table as a separate field in the resulting document.
237
- - Customizable document structure.
225
+ - Easily denormalize relational data
226
+ - Works with any PostgreSQL database (9.6 or later)
227
+ - Negligible impact on database performance
228
+ - Transactionally consistent output in Elasticsearch/OpenSearch:
229
+ - Writes appear only after they’re committed
230
+ - Inserts, updates, and deletes appear in commit order (as opposed to eventual consistency)
231
+ - Fault-tolerant: no data loss even on crashes or network issues; processing resumes from the last checkpoint
232
+ - Returns data directly as PostgreSQL/MySQL/MariaDB JSON for speed
233
+ - Supports composite primary and foreign keys
234
+ - Supports views and materialized views
235
+ - Handles arbitrarily deep nesting of related tables
236
+ - Supports PostgreSQL/MySQL/MariaDB JSON fields, allowing JSON properties to be extracted as separate document fields
237
+ - Customizable document structure
238
238
 
239
239
 
240
240
  #### Requirements
@@ -360,23 +360,26 @@ e.g
360
360
  }
361
361
  ```
362
362
 
363
- PGSync addresses the following challenges:
364
- - What if we update the author's name in the database?
365
- - What if we wanted to add another author for an existing book?
366
- - What if we have lots of documents already with the same author we wanted to change the author name?
367
- - What if we delete or update an author?
368
- - What if we truncate an entire table?
363
+ PGSync addresses common data consistency challenges, such as:
364
+
365
+ - Updating an author's name in the database
366
+ - Adding an additional author to an existing book
367
+ - Changing an author's name across many existing documents
368
+ - Deleting or updating an author record
369
+ - Truncating an entire table and keeping indexes in sync
369
370
 
370
371
 
371
372
  #### Benefits
372
373
 
373
- - PGSync is a simple to use out of the box solution for Change data capture.
374
- - PGSync handles data deletions.
375
- - PGSync requires little development effort. You simply define a schema config describing your data.
376
- - PGSync generates advanced queries matching your schema directly.
377
- - PGSync allows you to easily rebuild your indexes in case of a schema change.
378
- - You can expose only the data you require in Elasticsearch/OpenSearch.
379
- - Supports multiple Postgres/MySQL/MariaDB schemas for multi-tennant applications.
374
+ PGSync is a simple, out-of-the-box solution for change data capture, designed to minimize development effort and keep your search indexes in sync.
375
+
376
+ - Handles data deletions automatically.
377
+ - Requires minimal setup. Just define a schema config that describes your data.
378
+ - Generates advanced queries directly from your schema.
379
+ - Makes it easy to rebuild indexes after schema changes.
380
+ - Lets you expose only the data you need in Elasticsearch/OpenSearch.
381
+ - Supports multiple Postgres/MySQL/MariaDB schemas for multi-tenant applications.
382
+
380
383
 
381
384
 
382
385
  #### Contributing
@@ -1,4 +1,4 @@
1
- # PostgreSQL to Elasticsearch/OpenSearch sync
1
+ # PostgreSQL/MySQL/MariaDB to Elasticsearch/OpenSearch sync
2
2
 
3
3
 
4
4
  - [PGSync](https://pgsync.com) is a middleware for syncing data from [Postgres](https://www.postgresql.org) to [Elasticsearch](https://www.elastic.co/products/elastic-stack) or [OpenSearch](https://opensearch.org/).
@@ -2,4 +2,4 @@
2
2
 
3
3
  __author__ = "Tolu Aina"
4
4
  __email__ = "tolu@pgsync.com"
5
- __version__ = "6.0.0"
5
+ __version__ = "6.1.0"
@@ -8,8 +8,11 @@ import time
8
8
  import typing as t
9
9
  from contextlib import contextmanager
10
10
 
11
+ import psycopg2
11
12
  import sqlalchemy as sa
13
+ from psycopg2.extras import LogicalReplicationConnection
12
14
  from sqlalchemy.dialects import postgresql # noqa
15
+ from sqlalchemy.engine.url import make_url
13
16
  from sqlalchemy.orm import sessionmaker
14
17
 
15
18
  from .constants import (
@@ -746,6 +749,21 @@ class Base(object):
746
749
  )
747
750
  )[0]
748
751
 
752
+ def get_replication_connection(
753
+ self, engine: sa.engine.Engine
754
+ ) -> psycopg2.extensions.connection:
755
+ url: sa.engine.URL = make_url(str(engine.url))
756
+ # Build a libpq-style connection by keyword args
757
+ conn: psycopg2.extensions.connection = psycopg2.connect(
758
+ host=url.host,
759
+ port=url.port or 5432,
760
+ user=url.username,
761
+ password=url.password,
762
+ dbname=url.database,
763
+ connection_factory=LogicalReplicationConnection,
764
+ )
765
+ return conn
766
+
749
767
  def logical_slot_get_changes(
750
768
  self,
751
769
  slot_name: str,
@@ -63,7 +63,7 @@ def teardown(
63
63
  drop_database(sync.database)
64
64
  if drop_index:
65
65
  sync.search_client.teardown(sync.index)
66
- if delete_redis:
66
+ if delete_redis and sync.redis is not None:
67
67
  sync.redis.delete()
68
68
  if delete_checkpoint:
69
69
  try:
@@ -95,6 +95,7 @@ class Sync(Base, metaclass=Singleton):
95
95
  producer: bool = True,
96
96
  consumer: bool = True,
97
97
  bootstrap: bool = False,
98
+ wal: bool = False,
98
99
  **kwargs,
99
100
  ) -> None:
100
101
  """Constructor."""
@@ -119,12 +120,13 @@ class Sync(Base, metaclass=Singleton):
119
120
  self.producer: bool = producer
120
121
  self.consumer: bool = consumer
121
122
  self.num_workers: int = num_workers
122
- self.redis: RedisQueue = RedisQueue(self.__name)
123
+ # Redis not required in wal or polling mode
124
+ self._redis: t.Optional[RedisQueue] = None
123
125
  self.tree: Tree = Tree(
124
126
  self.models, nodes=self.nodes, database=doc["database"]
125
127
  )
126
128
  if bootstrap:
127
- self.setup()
129
+ self.setup(wal, polling)
128
130
 
129
131
  if validate:
130
132
  self.validate(repl_slots=repl_slots, polling=polling)
@@ -137,6 +139,8 @@ class Sync(Base, metaclass=Singleton):
137
139
  self.count: dict = dict(xlog=0, db=0, redis=0)
138
140
  self.tasks: t.List[asyncio.Task] = []
139
141
  self.lock: threading.Lock = threading.Lock()
142
+ # holds Payload objects across multiple consume() calls
143
+ self._buffer: list["Payload"] = []
140
144
 
141
145
  @property
142
146
  def slot_name(self) -> str:
@@ -147,6 +151,16 @@ class Sync(Base, metaclass=Singleton):
147
151
  def checkpoint_file(self) -> str:
148
152
  return os.path.join(settings.CHECKPOINT_PATH, f".{self.__name}")
149
153
 
154
+ @property
155
+ def redis(self) -> t.Optional[RedisQueue]:
156
+ """Return the Redis queue instance."""
157
+ if self._redis is None:
158
+ try:
159
+ self._redis = RedisQueue(self.__name)
160
+ except Exception:
161
+ pass
162
+ return self._redis
163
+
150
164
  def validate(self, repl_slots: bool = True, polling: bool = False) -> None:
151
165
  """Perform all validation right away."""
152
166
 
@@ -161,45 +175,51 @@ class Sync(Base, metaclass=Singleton):
161
175
  if self.index is None:
162
176
  raise ValueError("Index is missing for doc")
163
177
 
164
- if not self.is_mysql_compat:
165
- if not polling:
166
- max_replication_slots: t.Optional[str] = self.pg_settings(
167
- "max_replication_slots"
178
+ # replication slot not needed in polling or mysql
179
+ if not self.is_mysql_compat and not polling:
180
+ max_replication_slots: t.Optional[str] = self.pg_settings(
181
+ "max_replication_slots"
182
+ )
183
+ try:
184
+ if int(max_replication_slots) < 1:
185
+ raise TypeError
186
+ except TypeError:
187
+ raise RuntimeError(
188
+ "Ensure there is at least one replication slot defined "
189
+ "by setting max_replication_slots = 1"
168
190
  )
169
- try:
170
- if int(max_replication_slots) < 1:
171
- raise TypeError
172
- except TypeError:
173
- raise RuntimeError(
174
- "Ensure there is at least one replication slot defined "
175
- "by setting max_replication_slots = 1"
176
- )
177
191
 
178
- wal_level: t.Optional[str] = self.pg_settings("wal_level")
179
- if not wal_level or wal_level.lower() != "logical":
180
- raise RuntimeError(
181
- "Enable logical decoding by setting wal_level = logical"
182
- )
192
+ wal_level: t.Optional[str] = self.pg_settings("wal_level")
193
+ if not wal_level or wal_level.lower() != "logical":
194
+ raise RuntimeError(
195
+ "Enable logical decoding by setting wal_level = logical"
196
+ )
183
197
 
184
- self._can_create_replication_slot("_tmp_")
198
+ self._can_create_replication_slot("_tmp_")
185
199
 
186
- rds_logical_replication: t.Optional[str] = self.pg_settings(
187
- "rds.logical_replication"
188
- )
189
- if (
190
- rds_logical_replication
191
- and rds_logical_replication.lower() == "off"
192
- ):
193
- raise RDSError("rds.logical_replication is not enabled")
200
+ rds_logical_replication: t.Optional[str] = self.pg_settings(
201
+ "rds.logical_replication"
202
+ )
203
+ if (
204
+ rds_logical_replication
205
+ and rds_logical_replication.lower() == "off"
206
+ ):
207
+ raise RDSError("rds.logical_replication is not enabled")
194
208
 
195
- # ensure we have run bootstrap and the replication slot exists
196
- if repl_slots and not self.replication_slots(self.__name):
197
- raise RuntimeError(
198
- f'Replication slot "{self.__name}" does not exist.\n'
199
- f'Make sure you have run the "bootstrap" command.'
200
- )
209
+ # ensure we have run bootstrap and the replication slot exists
210
+ if repl_slots and not self.replication_slots(self.__name):
211
+ raise RuntimeError(
212
+ f'Replication slot "{self.__name}" does not exist.\n'
213
+ f'Make sure you have run the "bootstrap" command.'
214
+ )
201
215
 
202
- if not settings.REDIS_CHECKPOINT:
216
+ if settings.REDIS_CHECKPOINT:
217
+ # ensure Redis is reachable
218
+ try:
219
+ self.redis.ping()
220
+ except Exception as e:
221
+ raise RuntimeError(f"Cannot reach Redis: {e}")
222
+ else:
203
223
  # ensure the checkpoint dirpath is valid
204
224
  if not Path(settings.CHECKPOINT_PATH).exists():
205
225
  raise RuntimeError(
@@ -300,8 +320,12 @@ class Sync(Base, metaclass=Singleton):
300
320
  routing=self.routing,
301
321
  )
302
322
 
303
- def setup(self, no_create: bool = False) -> None:
304
- """Create the database triggers and replication slot."""
323
+ def setup(
324
+ self, no_create: bool = False, wal: bool = False, polling: bool = False
325
+ ) -> None:
326
+ """Create the database triggers and replication slot.
327
+ Generally bootstrap should not require Redis as it is optional in certain cases.
328
+ """
305
329
  if self.is_mysql_compat:
306
330
  raise NotImplementedError(
307
331
  "Setup is not supported for MySQL-family backend (MySQL or MariaDB)"
@@ -318,75 +342,85 @@ class Sync(Base, metaclass=Singleton):
318
342
 
319
343
  self.teardown(drop_view=False)
320
344
 
321
- for schema in self.schemas:
322
- # TODO: move if_not_exists to the function
323
- if if_not_exists or not self.function_exists(schema):
324
-
325
- self.create_function(schema)
326
-
327
- tables: t.Set = set()
328
- # tables with user defined foreign keys
329
- user_defined_fkey_tables: dict = {}
330
- node_columns: dict = {}
345
+ if not polling:
346
+ for schema in self.schemas:
347
+ # TODO: move if_not_exists to the function
348
+ if if_not_exists or not self.function_exists(schema):
349
+
350
+ self.create_function(schema)
351
+
352
+ tables: t.Set = set()
353
+ # tables with user defined foreign keys
354
+ user_defined_fkey_tables: dict = {}
355
+ node_columns: dict = {}
356
+
357
+ for node in self.tree.traverse_breadth_first():
358
+ if node.schema != schema:
359
+ continue
360
+ tables |= set(
361
+ [
362
+ through.table
363
+ for through in node.relationship.throughs
364
+ ]
365
+ )
366
+ tables |= set([node.table])
367
+ # we also need to bootstrap the base tables
368
+ tables |= set(node.base_tables)
369
+ node_columns[node.table] = set(
370
+ [
371
+ re.split(
372
+ rf"\s*({'|'.join(re.escape(op) for op in JSONB_OPERATORS)})\s*",
373
+ c,
374
+ maxsplit=1,
375
+ )[0]
376
+ for c in node.column_names
377
+ ]
378
+ )
379
+ # we want to get both the parent and the child keys here
380
+ # even though only one of them is the foreign_key.
381
+ # this is because we define both in the schema but
382
+ # do not specify which table is the foreign key.
383
+ columns: list = []
384
+ if node.relationship.foreign_key.parent:
385
+ columns.extend(
386
+ node.relationship.foreign_key.parent
387
+ )
388
+ if node.relationship.foreign_key.child:
389
+ columns.extend(node.relationship.foreign_key.child)
390
+ if columns:
391
+ user_defined_fkey_tables.setdefault(
392
+ node.table, set()
393
+ )
394
+ user_defined_fkey_tables[node.table] |= set(
395
+ columns
396
+ )
397
+ if tables:
398
+ if if_not_exists or not self.view_exists(
399
+ MATERIALIZED_VIEW, schema
400
+ ):
401
+ self.create_view(
402
+ self.index,
403
+ schema,
404
+ tables,
405
+ user_defined_fkey_tables,
406
+ node_columns,
407
+ )
331
408
 
332
- for node in self.tree.traverse_breadth_first():
333
- if node.schema != schema:
334
- continue
335
- tables |= set(
336
- [
337
- through.table
338
- for through in node.relationship.throughs
339
- ]
340
- )
341
- tables |= set([node.table])
342
- # we also need to bootstrap the base tables
343
- tables |= set(node.base_tables)
344
- node_columns[node.table] = set(
345
- [
346
- re.split(
347
- rf"\s*({'|'.join(re.escape(op) for op in JSONB_OPERATORS)})\s*",
348
- c,
349
- maxsplit=1,
350
- )[0]
351
- for c in node.column_names
352
- ]
353
- )
354
- # we want to get both the parent and the child keys here
355
- # even though only one of them is the foreign_key.
356
- # this is because we define both in the schema but
357
- # do not specify which table is the foreign key.
358
- columns: list = []
359
- if node.relationship.foreign_key.parent:
360
- columns.extend(node.relationship.foreign_key.parent)
361
- if node.relationship.foreign_key.child:
362
- columns.extend(node.relationship.foreign_key.child)
363
- if columns:
364
- user_defined_fkey_tables.setdefault(node.table, set())
365
- user_defined_fkey_tables[node.table] |= set(columns)
366
- if tables:
367
- if if_not_exists or not self.view_exists(
368
- MATERIALIZED_VIEW, schema
369
- ):
370
- self.create_view(
371
- self.index,
409
+ self.create_triggers(
372
410
  schema,
373
- tables,
374
- user_defined_fkey_tables,
375
- node_columns,
411
+ tables=tables,
412
+ join_queries=join_queries,
413
+ if_not_exists=if_not_exists,
376
414
  )
377
415
 
378
- self.create_triggers(
379
- schema,
380
- tables=tables,
381
- join_queries=join_queries,
382
- if_not_exists=if_not_exists,
383
- )
384
-
385
- if if_not_exists or not self.replication_slots(self.__name):
416
+ if not wal:
417
+ if if_not_exists or not self.replication_slots(self.__name):
386
418
 
387
- self.create_replication_slot(self.__name)
419
+ self.create_replication_slot(self.__name)
388
420
 
389
- def teardown(self, drop_view: bool = True) -> None:
421
+ def teardown(
422
+ self, drop_view: bool = True, polling: bool = False, wal: bool = False
423
+ ) -> None:
390
424
  """Drop the database triggers and replication slot."""
391
425
  if self.is_mysql_compat:
392
426
  raise NotImplementedError(
@@ -405,28 +439,35 @@ class Sync(Base, metaclass=Singleton):
405
439
  f"Checkpoint file not found: {self.checkpoint_file}"
406
440
  )
407
441
 
408
- self.redis.delete()
442
+ try:
443
+ if self._redis is None:
444
+ raise RuntimeError("Redis is not configured.")
445
+ self.redis.delete()
446
+ except Exception as e:
447
+ logger.warning(f"Could not clear Redis checkpoint queue: {e}")
409
448
 
410
- for schema in self.schemas:
411
- tables: t.Set = set()
412
- for node in self.tree.traverse_breadth_first():
413
- tables |= set(
414
- [
415
- through.table
416
- for through in node.relationship.throughs
417
- ]
449
+ if not polling:
450
+ for schema in self.schemas:
451
+ tables: t.Set = set()
452
+ for node in self.tree.traverse_breadth_first():
453
+ tables |= set(
454
+ [
455
+ through.table
456
+ for through in node.relationship.throughs
457
+ ]
458
+ )
459
+ tables |= set([node.table])
460
+ # we also need to teardown the base tables
461
+ tables |= set(node.base_tables)
462
+ self.drop_triggers(
463
+ schema=schema, tables=tables, join_queries=join_queries
418
464
  )
419
- tables |= set([node.table])
420
- # we also need to teardown the base tables
421
- tables |= set(node.base_tables)
422
- self.drop_triggers(
423
- schema=schema, tables=tables, join_queries=join_queries
424
- )
425
- if drop_view:
426
- self.drop_view(schema)
427
- self.drop_function(schema)
465
+ if drop_view:
466
+ self.drop_view(schema)
467
+ self.drop_function(schema)
428
468
 
429
- self.drop_replication_slot(self.__name)
469
+ if not wal:
470
+ self.drop_replication_slot(self.__name)
430
471
 
431
472
  def get_doc_id(self, primary_keys: t.List[str], table: str) -> str:
432
473
  """
@@ -571,7 +612,7 @@ class Sync(Base, metaclass=Singleton):
571
612
  is_mariadb: bool = getattr(conn.dialect, "is_mariadb", False)
572
613
 
573
614
  def _conn_settings_from_engine(engine: sa.Engine) -> dict:
574
- url = engine.url
615
+ url: sa.engine.URL = engine.url
575
616
  return {
576
617
  "host": url.host,
577
618
  "port": int(url.port),
@@ -581,9 +622,9 @@ class Sync(Base, metaclass=Singleton):
581
622
  "autocommit": True,
582
623
  }
583
624
 
584
- base = _conn_settings_from_engine(self.engine)
585
- connection_settings = dict(base) # replication socket
586
- ctl_connection_settings = dict(base)
625
+ base: dict = _conn_settings_from_engine(self.engine)
626
+ connection_settings: dict = dict(base) # replication socket
627
+ ctl_connection_settings: dict = dict(base)
587
628
  ctl_connection_settings["cursorclass"] = (
588
629
  pymysql.cursors.Cursor
589
630
  ) # tuple rows
@@ -607,11 +648,11 @@ class Sync(Base, metaclass=Singleton):
607
648
  freeze_schema=False,
608
649
  )
609
650
 
610
- current = 0
611
- total = None
651
+ current: int = 0
652
+ total: t.Optional[int] = None
612
653
  batch: list = []
613
654
  last_key: t.Optional[tuple[str, str]] = None
614
- batch_limit = limit
655
+ batch_limit: int = limit
615
656
 
616
657
  # Single-save checkpoint snapshot
617
658
  save_file: t.Optional[str] = start_log
@@ -652,7 +693,7 @@ class Sync(Base, metaclass=Singleton):
652
693
  self.engine, schema, table, row.get("values")
653
694
  ),
654
695
  )
655
- key = (payload.tg_op, payload.table)
696
+ key: tuple[str, str] = (payload.tg_op, payload.table)
656
697
  if last_key is None or key == last_key:
657
698
  batch.append(payload)
658
699
  else:
@@ -1836,6 +1877,97 @@ class Sync(Base, metaclass=Singleton):
1836
1877
 
1837
1878
  self._truncate = True
1838
1879
 
1880
+ def _flush_buffer(
1881
+ self,
1882
+ cursor: t.Any,
1883
+ flush_lsn: t.Optional[str] = None,
1884
+ force_ack: bool = False,
1885
+ ) -> None:
1886
+ # If we have buffered docs, send them
1887
+ if self._buffer:
1888
+ logger.info(f"flushing buffer with {len(self._buffer)} docs")
1889
+ docs: list = []
1890
+ for (op, tbl), run in groupby(
1891
+ self._buffer,
1892
+ key=lambda payload: (payload.tg_op, payload.table),
1893
+ ):
1894
+ batch: list = list(run)
1895
+ logger.info(f"bulk group op={op} tbl={tbl} size={len(batch)}")
1896
+ docs.extend(self._payloads(batch))
1897
+
1898
+ if docs:
1899
+ processed: int = len(self._buffer)
1900
+ logger.info(f"sending bulk of {len(docs)} docs")
1901
+ self.search_client.bulk(self.index, docs)
1902
+ self.count["xlog"] += processed
1903
+ logger.info(f"sent bulk of {len(docs)} docs")
1904
+
1905
+ # if caller didn't provide a flush_lsn, then fall back to last buffered row
1906
+ if flush_lsn is None:
1907
+ flush_lsn = self._buffer_last_lsn
1908
+
1909
+ # clear buffer after successful bulk
1910
+ self._buffer.clear()
1911
+ self._buffer_last_lsn = None
1912
+
1913
+ # Even if buffer was empty, we may want to ACK a COMMIT LSN
1914
+ if flush_lsn is not None and (force_ack or not self._buffer):
1915
+ cursor.send_feedback(flush_lsn=flush_lsn, force=True)
1916
+ logger.info(f"sent feedback flush_lsn=P{flush_lsn}")
1917
+
1918
+ def consume(self, message: t.Any) -> None:
1919
+ raw: t.Any = message.payload
1920
+ lsn: t.Optional[str] = message.data_start
1921
+ chunk_size: int = settings.LOGICAL_SLOT_CHUNK_SIZE
1922
+
1923
+ logger.debug(f"[LSN {lsn}] {raw}")
1924
+
1925
+ match = TX_BOUNDARY_RE.match(raw)
1926
+ if match:
1927
+ kind: str = match.group(1).upper()
1928
+ if kind == "COMMIT":
1929
+ # Flush any buffered docs, and ACK this COMMIT LSN
1930
+ self._flush_buffer(
1931
+ cursor=message.cursor,
1932
+ flush_lsn=lsn,
1933
+ force_ack=True, # ACK even if buffer empty
1934
+ )
1935
+ # BEGIN/COMMIT don't include rows by themselves
1936
+ return
1937
+
1938
+ # Not BEGIN/COMMIT -> row change
1939
+ try:
1940
+ payload: Payload = self.parse_logical_slot(raw)
1941
+ except Exception:
1942
+ logger.exception(f"Error parsing row: {raw}")
1943
+ raise
1944
+
1945
+ # Filter by schema
1946
+ if payload.schema not in self.tree.schemas:
1947
+ # we still saw this LSN; it will be ACKed at COMMIT
1948
+ return
1949
+
1950
+ # Buffer across transactions
1951
+ self._buffer.append(payload)
1952
+ self._buffer_last_lsn = lsn
1953
+
1954
+ # Flush when big enough
1955
+ if len(self._buffer) >= chunk_size:
1956
+ self._flush_buffer(message.cursor)
1957
+
1958
+ def wal_consumer(self) -> None:
1959
+ # open a replication‐mode connection
1960
+ conn = self.get_replication_connection(self.engine)
1961
+ cursor = conn.cursor()
1962
+ # start streaming; include XIDs so you see BEGIN/COMMIT markers
1963
+ cursor.start_replication(
1964
+ slot_name=self.__name,
1965
+ options={"include-xids": "1", "skip-empty-xacts": "1"},
1966
+ decode=True, # gets you str instead of bytes
1967
+ )
1968
+ logger.info("Starting logical replication stream (test_decoding)...")
1969
+ cursor.consume_stream(self.consume)
1970
+
1839
1971
  @threaded
1840
1972
  @exception
1841
1973
  def truncate_slots(self) -> None:
@@ -1962,7 +2094,7 @@ class Sync(Base, metaclass=Singleton):
1962
2094
  is_flag=True,
1963
2095
  help="Run as a daemon (Incompatible with --polling)",
1964
2096
  cls=MutuallyExclusiveOption,
1965
- mutually_exclusive=["polling"],
2097
+ mutually_exclusive=["polling", "wal"],
1966
2098
  )
1967
2099
  @click.option(
1968
2100
  "--producer",
@@ -1985,7 +2117,19 @@ class Sync(Base, metaclass=Singleton):
1985
2117
  is_flag=True,
1986
2118
  help="Polling mode (Incompatible with -d)",
1987
2119
  cls=MutuallyExclusiveOption,
1988
- mutually_exclusive=["daemon"],
2120
+ mutually_exclusive=["daemon", "wal"],
2121
+ )
2122
+ @click.option(
2123
+ "--wal",
2124
+ "-w",
2125
+ is_flag=True,
2126
+ default=False,
2127
+ help="Use WAL for replication",
2128
+ cls=MutuallyExclusiveOption,
2129
+ mutually_exclusive=[
2130
+ "daemon",
2131
+ "polling",
2132
+ ],
1989
2133
  )
1990
2134
  @click.option("--host", "-h", help="PG_HOST override")
1991
2135
  @click.option("--password", is_flag=True, help="Prompt for database password")
@@ -2066,6 +2210,7 @@ def main(
2066
2210
  producer: bool,
2067
2211
  consumer: bool,
2068
2212
  bootstrap: bool,
2213
+ wal: bool,
2069
2214
  ) -> None:
2070
2215
  """Main application syncer."""
2071
2216
  if version:
@@ -2129,17 +2274,25 @@ def main(
2129
2274
  # In polling mode, the app can run without replication slots or triggers.
2130
2275
  # However, this is not the preferred mode of operation.
2131
2276
  # It should be considered a workaround for running on a read-only cluster.
2132
- kwargs["polling"] = True
2133
2277
  while True:
2134
2278
  for doc in config_loader(
2135
2279
  config=config,
2136
2280
  schema_url=schema_url,
2137
2281
  s3_schema_url=s3_schema_url,
2138
2282
  ):
2139
- sync: Sync = Sync(doc, verbose=verbose, **kwargs)
2283
+ sync: Sync = Sync(
2284
+ doc, verbose=verbose, polling=True, **kwargs
2285
+ )
2140
2286
  sync.pull(polling=True)
2141
2287
  time.sleep(settings.POLL_INTERVAL)
2142
-
2288
+ elif wal:
2289
+ for doc in config_loader(
2290
+ config=config,
2291
+ schema_url=schema_url,
2292
+ s3_schema_url=s3_schema_url,
2293
+ ):
2294
+ sync: Sync = Sync(doc, verbose=verbose, wal=True, **kwargs)
2295
+ sync.wal_consumer()
2143
2296
  else:
2144
2297
  tasks: t.List[asyncio.Task] = []
2145
2298
  for doc in config_loader(
@@ -2162,7 +2315,7 @@ def main(
2162
2315
  tasks.extend(sync.tasks)
2163
2316
 
2164
2317
  if settings.USE_ASYNC:
2165
- event_loop = asyncio.get_event_loop()
2318
+ event_loop: asyncio.AbstractEventLoop = asyncio.get_event_loop()
2166
2319
  event_loop.run_until_complete(asyncio.gather(*tasks))
2167
2320
  event_loop.close()
2168
2321
 
@@ -280,7 +280,10 @@ def config_loader(
280
280
 
281
281
  try:
282
282
  with open(config_path, "r") as f:
283
- data = json.load(f)
283
+ try:
284
+ data = json.load(f)
285
+ except json.JSONDecodeError as e:
286
+ raise ValueError(f"{config_path} is not valid JSON: {e}") from e
284
287
  for doc in data:
285
288
  for key, value in doc.items():
286
289
  try:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pgsync
3
- Version: 6.0.0
3
+ Version: 6.1.0
4
4
  Summary: Postgres/MySQL/MariaDB to Elasticsearch/OpenSearch sync
5
5
  Home-page: https://github.com/toluaina/pgsync
6
6
  Author: Tolu Aina
@@ -33,9 +33,9 @@ License-File: LICENSE
33
33
  License-File: AUTHORS.rst
34
34
  Requires-Dist: async-timeout==5.0.1
35
35
  Requires-Dist: backports-datetime-fromisoformat==2.0.3
36
- Requires-Dist: boto3==1.40.64
37
- Requires-Dist: botocore==1.40.64
38
- Requires-Dist: certifi==2025.10.5
36
+ Requires-Dist: boto3==1.41.2
37
+ Requires-Dist: botocore==1.41.2
38
+ Requires-Dist: certifi==2025.11.12
39
39
  Requires-Dist: charset-normalizer==3.4.4
40
40
  Requires-Dist: click==8.1.8
41
41
  Requires-Dist: elastic-transport==9.1.0
@@ -46,7 +46,7 @@ Requires-Dist: events==0.5
46
46
  Requires-Dist: idna==3.11
47
47
  Requires-Dist: jmespath==1.0.1
48
48
  Requires-Dist: marshmallow==4.0.1
49
- Requires-Dist: mysql-replication==1.0.9
49
+ Requires-Dist: mysql-replication==1.0.12
50
50
  Requires-Dist: opensearch-dsl==2.1.0
51
51
  Requires-Dist: opensearch-py==3.0.0
52
52
  Requires-Dist: packaging==25.0
@@ -57,7 +57,7 @@ Requires-Dist: python-dotenv==1.2.1
57
57
  Requires-Dist: redis==7.0.1
58
58
  Requires-Dist: requests==2.32.5
59
59
  Requires-Dist: requests-aws4auth==1.3.1
60
- Requires-Dist: s3transfer==0.14.0
60
+ Requires-Dist: s3transfer==0.15.0
61
61
  Requires-Dist: six==1.17.0
62
62
  Requires-Dist: sqlalchemy==2.0.44
63
63
  Requires-Dist: sqlparse==0.5.3
@@ -79,7 +79,7 @@ Dynamic: requires-dist
79
79
  Dynamic: requires-python
80
80
  Dynamic: summary
81
81
 
82
- # PostgreSQL to Elasticsearch/OpenSearch sync
82
+ # PostgreSQL/MySQL/MariaDB to Elasticsearch/OpenSearch sync
83
83
 
84
84
 
85
85
  - [PGSync](https://pgsync.com) is a middleware for syncing data from [Postgres](https://www.postgresql.org) to [Elasticsearch](https://www.elastic.co/products/elastic-stack)/[OpenSearch](https://opensearch.org/) or [OpenSearch](https://opensearch.org/).
@@ -1,8 +1,8 @@
1
1
  async-timeout==5.0.1
2
2
  backports-datetime-fromisoformat==2.0.3
3
- boto3==1.40.64
4
- botocore==1.40.64
5
- certifi==2025.10.5
3
+ boto3==1.41.2
4
+ botocore==1.41.2
5
+ certifi==2025.11.12
6
6
  charset-normalizer==3.4.4
7
7
  click==8.1.8
8
8
  elastic-transport==9.1.0
@@ -13,7 +13,7 @@ events==0.5
13
13
  idna==3.11
14
14
  jmespath==1.0.1
15
15
  marshmallow==4.0.1
16
- mysql-replication==1.0.9
16
+ mysql-replication==1.0.12
17
17
  opensearch-dsl==2.1.0
18
18
  opensearch-py==3.0.0
19
19
  packaging==25.0
@@ -24,7 +24,7 @@ python-dotenv==1.2.1
24
24
  redis==7.0.1
25
25
  requests==2.32.5
26
26
  requests-aws4auth==1.3.1
27
- s3transfer==0.14.0
27
+ s3transfer==0.15.0
28
28
  six==1.17.0
29
29
  sqlalchemy==2.0.44
30
30
  sqlparse==0.5.3
@@ -937,7 +937,7 @@ class TestSync(object):
937
937
  mock_teardown.assert_called_once_with(drop_view=False)
938
938
 
939
939
  @patch("pgsync.redisqueue.RedisQueue.delete")
940
- def test_teardown(self, mock_redis, sync):
940
+ def test_teardown(self, mock_redis_delete, sync):
941
941
  with override_env_var(JOIN_QUERIES="False"):
942
942
  importlib.reload(settings)
943
943
 
@@ -960,16 +960,19 @@ class TestSync(object):
960
960
  )
961
961
  mock_drop_view.assert_called_once_with("public")
962
962
  mock_drop_function.assert_called_once_with("public")
963
- mock_redis.assert_called_once()
963
+ mock_redis_delete.assert_not_called()
964
964
  assert os.path.exists(sync.checkpoint_file) is False
965
965
 
966
966
  with patch("pgsync.sync.logger") as mock_logger:
967
967
  with patch("pgsync.sync.Base.drop_replication_slot"):
968
968
  self.checkpoint_file = "foo"
969
969
  sync.teardown()
970
- mock_logger.warning.assert_called_once_with(
971
- "Checkpoint file not found: ./.testdb_testdb"
972
- )
970
+ assert mock_logger.warning.call_args_list == [
971
+ call("Checkpoint file not found: ./.testdb_testdb"),
972
+ call(
973
+ "Could not clear Redis checkpoint queue: Redis is not configured."
974
+ ),
975
+ ]
973
976
 
974
977
  def test_root(self, sync):
975
978
  root = sync.tree.root
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes