PyPI - pgsync - Versions diffs - 3.0.0__tar.gz → 3.1.0__tar.gz - Mend

pgsync 3.0.0.tar.gz → 3.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76) hide show

{pgsync-3.0.0 → pgsync-3.1.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pgsync
-Version: 3.0.0
+Version: 3.1.0
 Summary: Postgres to Elasticsearch/OpenSearch sync
 Home-page: https://github.com/toluaina/pgsync
 Author: Tolu Aina
@@ -21,6 +21,7 @@ Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
 Classifier: License :: OSI Approved :: MIT License
@@ -30,39 +31,33 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: AUTHORS.rst
 Requires-Dist: async-timeout==4.0.3
-Requires-Dist: black==23.11.0
-Requires-Dist: boto3==1.29.3
-Requires-Dist: botocore==1.32.3
+Requires-Dist: boto3==1.34.11
+Requires-Dist: botocore==1.34.11
 Requires-Dist: certifi==2023.11.17
 Requires-Dist: charset-normalizer==3.3.2
 Requires-Dist: click==8.1.7
-Requires-Dist: elasticsearch==7.13.4
-Requires-Dist: elasticsearch-dsl==7.4.1
-Requires-Dist: environs==9.5.0
-Requires-Dist: faker==20.0.3
-Requires-Dist: greenlet==3.0.1
-Requires-Dist: idna==3.4
-Requires-Dist: isort==5.12.0
+Requires-Dist: elastic-transport==8.11.0
+Requires-Dist: elasticsearch==8.11.1
+Requires-Dist: elasticsearch-dsl==8.11.0
+Requires-Dist: environs==10.0.0
+Requires-Dist: greenlet==3.0.3
+Requires-Dist: idna==3.6
 Requires-Dist: jmespath==1.0.1
 Requires-Dist: marshmallow==3.20.1
-Requires-Dist: mypy-extensions==1.0.0
 Requires-Dist: opensearch-dsl==2.1.0
-Requires-Dist: opensearch-py==2.4.1
+Requires-Dist: opensearch-py==2.4.2
 Requires-Dist: packaging==23.2
-Requires-Dist: pathspec==0.11.2
-Requires-Dist: platformdirs==4.0.0
 Requires-Dist: psycopg2-binary==2.9.9
 Requires-Dist: python-dateutil==2.8.2
 Requires-Dist: python-dotenv==1.0.0
 Requires-Dist: redis==5.0.1
 Requires-Dist: requests==2.31.0
 Requires-Dist: requests-aws4auth==1.2.3
-Requires-Dist: s3transfer==0.7.0
+Requires-Dist: s3transfer==0.10.0
 Requires-Dist: six==1.16.0
-Requires-Dist: sqlalchemy==1.4.50
+Requires-Dist: sqlalchemy==2.0.25
 Requires-Dist: sqlparse==0.4.4
-Requires-Dist: tomli==2.0.1
-Requires-Dist: typing-extensions==4.8.0
+Requires-Dist: typing-extensions==4.9.0
 Requires-Dist: urllib3==1.26.18
 # PostgreSQL to Elasticsearch/OpenSearch sync

{pgsync-3.0.0 → pgsync-3.1.0}/bin/bootstrap RENAMED Viewed

@@ -56,9 +56,9 @@ def main(teardown, config, user, password, host, port, verbose):
     validate: bool = False if teardown else True
-    for document in config_loader(config):
+    for doc in config_loader(config):
         sync: Sync = Sync(
-            document,
+            doc,
             verbose=verbose,
             validate=validate,
             repl_slots=False,

{pgsync-3.0.0 → pgsync-3.1.0}/bin/parallel_sync RENAMED Viewed

@@ -1,42 +1,42 @@
 #!/usr/bin/env python
 """
-Parallel sync is an innovative, experimental feature designed to optimize
-throughput by utilizing available CPUs/threads, particularly beneficial
+Parallel sync is an innovative, experimental feature designed to optimize
+throughput by utilizing available CPUs/threads, particularly beneficial
 in environments experiencing high network latency.
 Scenario & Challenge:
-In instances where your PG database, Elasticsearch/OpenSearch, and PGSync
-servers operate on divergent networks, a delay in request/response time is
-noticeable. The primary constraint emerges from the database query's roundtrip,
-which even server-side cursors can address only to a limited extent by fetching
-a certain number of records at a time. The consequent delay in fetching the
+In instances where your PG database, Elasticsearch/OpenSearch, and PGSync
+servers operate on divergent networks, a delay in request/response time is
+noticeable. The primary constraint emerges from the database query's roundtrip,
+which even server-side cursors can address only to a limited extent by fetching
+a certain number of records at a time. The consequent delay in fetching the
 next cursor significantly hampers the overall synchronization speed.
 Solution:
-To mitigate this, the strategy is to conduct an initial fast/parallel sync,
-thereby populating Elasticsearch/OpenSearch in a single iteration.
+To mitigate this, the strategy is to conduct an initial fast/parallel sync,
+thereby populating Elasticsearch/OpenSearch in a single iteration.
 Post this, the regular pgsync can continue running as a daemon.
 Approach and Technical Implementation:
-The approach centers around utilizing the Tuple identifier record of the table
-columns. Every table incorporates a system column – "ctid" of type "tid,"
+The approach centers around utilizing the Tuple identifier record of the table
+columns. Every table incorporates a system column – "ctid" of type "tid,"
 which helps identify the page record and the row number in each block.
 This element facilitates the pagination of the sync process.
-Technically, pagination implies dividing each paged record amongst the
-available CPUs/threads. This division enables the parallel execution of
-Elasticsearch/OpenSearch bulk inserts. The "ctid" serves as a tuple
+Technically, pagination implies dividing each paged record amongst the
+available CPUs/threads. This division enables the parallel execution of
+Elasticsearch/OpenSearch bulk inserts. The "ctid" serves as a tuple
 (for instance, (1, 5)), pinpointing the row in a disk page.
-By leveraging this method, all paged row records are retrieved upfront and
-allocated as work units across the worker threads/CPUs.
-Each work unit, defined by the BLOCK_SIZE, denotes the number of root node
+By leveraging this method, all paged row records are retrieved upfront and
+allocated as work units across the worker threads/CPUs.
+Each work unit, defined by the BLOCK_SIZE, denotes the number of root node
 records assigned for each worker to process.
-Subsequently, the workers execute queries for each assigned chunk of work,
-filtered based on the page number and row numbers.
-This systematic and parallel approach optimizes the synchronization process,
+Subsequently, the workers execute queries for each assigned chunk of work,
+filtered based on the page number and row numbers.
+This systematic and parallel approach optimizes the synchronization process,
 especially in environments challenged by network latency.
 """
@@ -45,56 +45,50 @@ import multiprocessing
 import os
 import re
 import sys
+import typing as t
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from queue import Queue
 from threading import Thread
-from typing import Generator, Optional, Union
 import click
 import sqlalchemy as sa
 from pgsync.settings import BLOCK_SIZE, CHECKPOINT_PATH
 from pgsync.sync import Sync
-from pgsync.utils import (
-    compiled_query,
-    config_loader,
-    get_config,
-    show_settings,
-    timeit,
-)
+from pgsync.utils import config_loader, get_config, show_settings, timeit
-def save_ctid(page: int, row: int, name: str) -> None:
+def save_ctid(page: int, row: int, filename: str) -> None:
     """
     Save the checkpoint for a given page and row in a file with the given name.
     Args:
         page (int): The page number to save.
         row (int): The row number to save.
-        name (str): The name of the file to save the checkpoint in.
+        filename (str): The name of the file to save the checkpoint in.
     """
-    checkpoint_file: str = os.path.join(CHECKPOINT_PATH, f".{name}.ctid")
-    with open(checkpoint_file, "w+") as fp:
+    filepath: str = os.path.join(CHECKPOINT_PATH, f".{filename}.ctid")
+    with open(filepath, "w+") as fp:
         fp.write(f"{page},{row}\n")
-def read_ctid(name: str) -> None:
+def read_ctid(filename: str) -> t.Tuple[t.Optional[int], t.Optional[int]]:
     """
     Reads the checkpoint file for the given name and returns the page and row numbers.
     Args:
-        name (str): The name of the checkpoint file.
+        filename (str): The name of the checkpoint file.
     Returns:
         tuple: A tuple containing the page and row numbers. If the checkpoint file does not exist, returns (None, None).
     """
-    checkpoint_file: str = os.path.join(CHECKPOINT_PATH, f".{name}.ctid")
-    if os.path.exists(checkpoint_file):
-        with open(checkpoint_file, "r") as fp:
+    filepath: str = os.path.join(CHECKPOINT_PATH, f".{filename}.ctid")
+    if os.path.exists(filepath):
+        with open(filepath, "r") as fp:
             pairs: str = fp.read().split()[0].split(",")
-            page = int(pairs[0])
-            row = int(pairs[1])
+            page: int = int(pairs[0])
+            row: int = int(pairs[1])
             return page, row
     return None, None
@@ -120,7 +114,6 @@ class Task:
         sync: Sync = Sync(
             self.doc, verbose=self.verbose, validate=self.validate
         )
-        sync.tree.build(sync.nodes)
         txmin: int = sync.checkpoint
         txmax: int = sync.txid_current
         sync.search_client.bulk(
@@ -134,19 +127,19 @@ class Task:
 @timeit
 def fetch_tasks(
     doc: dict,
-    block_size: Optional[int] = None,
-) -> Generator:
+    block_size: t.Optional[int] = None,
+) -> t.Generator:
     block_size = block_size or BLOCK_SIZE
     pages: dict = {}
     sync: Sync = Sync(doc)
-    page: Optional[int] = None
-    row: Optional[int] = None
-    name: str = re.sub(
+    page: t.Optional[int] = None
+    row: t.Optional[int] = None
+    filename: str = re.sub(
         "[^0-9a-zA-Z_]+", "", f"{sync.database.lower()}_{sync.index}"
     )
-    page, row = read_ctid(name=name)
+    page, row = read_ctid(filename)
     statement: sa.sql.Select = sa.select(
-        [
+        *[
             sa.literal_column("1").label("x"),
             sa.literal_column("1").label("y"),
             sa.column("ctid"),
@@ -213,11 +206,13 @@ def fetch_tasks(
 @timeit
 def synchronous(
-    tasks: Generator, doc: dict, verbose: bool = False, validate: bool = False
+    tasks: t.Generator,
+    doc: dict,
+    verbose: bool = False,
+    validate: bool = False,
 ) -> None:
     sys.stdout.write("Synchronous\n")
     sync: Sync = Sync(doc, verbose=verbose, validate=validate)
-    sync.tree.build(sync.nodes)
     txmin: int = sync.checkpoint
     txmax: int = sync.txid_current
     index: str = sync.index
@@ -231,9 +226,9 @@ def synchronous(
 @timeit
 def multithreaded(
-    tasks: Generator,
+    tasks: t.Generator,
     doc: dict,
-    nprocs: Optional[int] = None,
+    nthreads: t.Optional[int] = None,
     verbose: bool = False,
     validate: bool = False,
 ) -> None:
@@ -250,12 +245,11 @@ def multithreaded(
             )
             queue.task_done()
-    nprocs: int = nprocs or 1
+    nthreads: int = nthreads or 1
     queue: Queue = Queue()
     sync: Sync = Sync(doc, verbose=verbose, validate=validate)
-    sync.tree.build(sync.nodes)
-    for _ in range(nprocs):
+    for _ in range(nthreads):
         thread: Thread = Thread(
             target=worker,
             args=(
@@ -274,15 +268,15 @@ def multithreaded(
 @timeit
 def multiprocess(
-    tasks: Generator,
+    tasks: t.Generator,
     doc: dict,
-    nprocs: Optional[int] = None,
+    ncpus: t.Optional[int] = None,
     verbose: bool = False,
     validate: bool = False,
 ) -> None:
     sys.stdout.write("Multiprocess\n")
     task: Task = Task(doc, verbose=verbose, validate=validate)
-    with ProcessPoolExecutor(max_workers=nprocs) as executor:
+    with ProcessPoolExecutor(max_workers=ncpus) as executor:
         try:
             list(executor.map(task.process, tasks))
         except Exception as e:
@@ -292,14 +286,14 @@ def multiprocess(
 @timeit
 def multithreaded_async(
-    tasks: Generator,
+    tasks: t.Generator,
     doc: dict,
-    nprocs: Optional[int] = None,
+    nthreads: t.Optional[int] = None,
     verbose: bool = False,
     validate: bool = False,
 ) -> None:
     sys.stdout.write("Multi-threaded async\n")
-    executor: ThreadPoolExecutor = ThreadPoolExecutor(max_workers=nprocs)
+    executor: ThreadPoolExecutor = ThreadPoolExecutor(max_workers=nthreads)
     event_loop = asyncio.get_event_loop()
     event_loop.run_until_complete(
         run_tasks(executor, tasks, doc, verbose=verbose, validate=validate)
@@ -309,14 +303,14 @@ def multithreaded_async(
 @timeit
 def multiprocess_async(
-    tasks: Generator,
+    tasks: t.Generator,
     doc: dict,
-    nprocs: Optional[int] = None,
+    ncpus: t.Optional[int] = None,
     verbose: bool = False,
     validate: bool = False,
 ) -> None:
     sys.stdout.write("Multi-process async\n")
-    executor: ProcessPoolExecutor = ProcessPoolExecutor(max_workers=nprocs)
+    executor: ProcessPoolExecutor = ProcessPoolExecutor(max_workers=ncpus)
     event_loop = asyncio.get_event_loop()
     try:
         event_loop.run_until_complete(
@@ -328,18 +322,18 @@ def multiprocess_async(
 async def run_tasks(
-    executor: Union[ThreadPoolExecutor, ProcessPoolExecutor],
-    tasks: Generator,
+    executor: t.Union[ThreadPoolExecutor, ProcessPoolExecutor],
+    tasks: t.Generator,
     doc: dict,
     verbose: bool = False,
     validate: bool = False,
 ) -> None:
-    sync: Optional[Sync] = None
+    sync: t.Optional[Sync] = None
     if isinstance(executor, ThreadPoolExecutor):
         # threads can share a common Sync object
         sync = Sync(doc, verbose=verbose, validate=validate)
     event_loop = asyncio.get_event_loop()
-    completed, pending = await asyncio.wait(
+    completed, _ = await asyncio.wait(
         [
             event_loop.run_in_executor(
                 executor, run_task, task, sync, doc, verbose, validate
@@ -354,14 +348,13 @@ async def run_tasks(
 def run_task(
     task: dict,
-    sync: Optional[Sync] = None,
-    doc: Optional[dict] = None,
+    sync: t.Optional[Sync] = None,
+    doc: t.Optional[dict] = None,
     verbose: bool = False,
     validate: bool = False,
 ) -> int:
     if sync is None:
         sync: Sync = Sync(doc, verbose=verbose, validate=validate)
-    sync.tree.build(sync.nodes)
     txmin: int = sync.checkpoint
     txmax: int = sync.txid_current
     sync.search_client.bulk(
@@ -371,10 +364,10 @@ def run_task(
     if len(task) > 0:
         page: int = max(task.keys())
         row: int = max(task[page])
-        name: str = re.sub(
+        filename: str = re.sub(
             "[^0-9a-zA-Z_]+", "", f"{sync.database.lower()}_{sync.index}"
         )
-        save_ctid(page=page, row=row, name=name)
+        save_ctid(page, row, filename)
     return 1
@@ -426,20 +419,18 @@ def main(config, nprocs, mode, verbose):
     show_settings()
     config: str = get_config(config)
-    for document in config_loader(config):
-        tasks: Generator = fetch_tasks(document)
+    for doc in config_loader(config):
+        tasks: t.Generator = fetch_tasks(doc)
         if mode == "synchronous":
-            synchronous(tasks, document, verbose=verbose)
+            synchronous(tasks, doc, verbose=verbose)
         elif mode == "multithreaded":
-            multithreaded(tasks, document, nprocs=nprocs, verbose=verbose)
+            multithreaded(tasks, doc, nthreads=nprocs, verbose=verbose)
         elif mode == "multiprocess":
-            multiprocess(tasks, document, nprocs=nprocs, verbose=verbose)
+            multiprocess(tasks, doc, ncpus=nprocs, verbose=verbose)
         elif mode == "multithreaded_async":
-            multithreaded_async(
-                tasks, document, nprocs=nprocs, verbose=verbose
-            )
+            multithreaded_async(tasks, doc, nthreads=nprocs, verbose=verbose)
         elif mode == "multiprocess_async":
-            multiprocess_async(tasks, document, nprocs=nprocs, verbose=verbose)
+            multiprocess_async(tasks, doc, ncpus=nprocs, verbose=verbose)
 if __name__ == "__main__":

{pgsync-3.0.0 → pgsync-3.1.0}/pgsync/__init__.py RENAMED Viewed

@@ -2,4 +2,4 @@
 __author__ = "Tolu Aina"
 __email__ = "tolu@pgsync.com"
-__version__ = "3.0.0"
+__version__ = "3.1.0"

pgsync 3.0.0__tar.gz → 3.1.0__tar.gz

pgsync 3.0.0.tar.gz → 3.1.0.tar.gz