pgsync 2.5.0__tar.gz → 3.1.0__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. pgsync-3.1.0/LICENSE +21 -0
  2. {pgsync-2.5.0 → pgsync-3.1.0}/PKG-INFO +36 -6
  3. {pgsync-2.5.0 → pgsync-3.1.0}/README.md +5 -5
  4. {pgsync-2.5.0 → pgsync-3.1.0}/README.rst +1 -1
  5. {pgsync-2.5.0 → pgsync-3.1.0}/bin/bootstrap +8 -2
  6. {pgsync-2.5.0 → pgsync-3.1.0}/bin/parallel_sync +104 -97
  7. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/__init__.py +1 -1
  8. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/base.py +202 -159
  9. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/constants.py +14 -1
  10. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/helper.py +18 -8
  11. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/node.py +62 -48
  12. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/plugin.py +16 -5
  13. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/querybuilder.py +28 -46
  14. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/redisqueue.py +5 -5
  15. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/search_client.py +108 -76
  16. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/settings.py +26 -6
  17. pgsync-3.1.0/pgsync/singleton.py +39 -0
  18. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/sync.py +137 -100
  19. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/transform.py +20 -9
  20. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/trigger.py +7 -1
  21. pgsync-3.1.0/pgsync/urls.py +145 -0
  22. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/utils.py +77 -26
  23. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/view.py +215 -44
  24. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync.egg-info/PKG-INFO +36 -6
  25. pgsync-3.1.0/pgsync.egg-info/requires.txt +29 -0
  26. pgsync-3.1.0/pyproject.toml +3 -0
  27. {pgsync-2.5.0 → pgsync-3.1.0}/setup.cfg +1 -1
  28. {pgsync-2.5.0 → pgsync-3.1.0}/setup.py +6 -5
  29. {pgsync-2.5.0 → pgsync-3.1.0}/tests/conftest.py +133 -82
  30. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_base.py +96 -73
  31. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_constants.py +1 -0
  32. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_node.py +24 -21
  33. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_redisqueue.py +16 -16
  34. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_search_client.py +5 -9
  35. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_settings.py +1 -1
  36. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_sync.py +23 -26
  37. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_sync_nested_children.py +39 -44
  38. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_sync_root.py +35 -55
  39. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_sync_single_child_fk_on_child.py +35 -48
  40. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_sync_single_child_fk_on_parent.py +35 -48
  41. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_trigger.py +3 -2
  42. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_unique_behaviour.py +2 -10
  43. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_utils.py +4 -4
  44. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_view.py +83 -55
  45. {pgsync-2.5.0 → pgsync-3.1.0}/tests/testing_utils.py +5 -3
  46. pgsync-2.5.0/LICENSE +0 -165
  47. pgsync-2.5.0/pgsync/singleton.py +0 -20
  48. pgsync-2.5.0/pgsync/urls.py +0 -99
  49. pgsync-2.5.0/pgsync.egg-info/requires.txt +0 -43
  50. pgsync-2.5.0/pyproject.toml +0 -3
  51. {pgsync-2.5.0 → pgsync-3.1.0}/AUTHORS.rst +0 -0
  52. {pgsync-2.5.0 → pgsync-3.1.0}/CONTRIBUTING.rst +0 -0
  53. {pgsync-2.5.0 → pgsync-3.1.0}/HISTORY.rst +0 -0
  54. {pgsync-2.5.0 → pgsync-3.1.0}/MANIFEST.in +0 -0
  55. {pgsync-2.5.0 → pgsync-3.1.0}/bin/pgsync +0 -0
  56. {pgsync-2.5.0 → pgsync-3.1.0}/docs/Makefile +0 -0
  57. {pgsync-2.5.0 → pgsync-3.1.0}/docs/authors.rst +0 -0
  58. {pgsync-2.5.0 → pgsync-3.1.0}/docs/changelog.rst +0 -0
  59. {pgsync-2.5.0 → pgsync-3.1.0}/docs/conf.py +0 -0
  60. {pgsync-2.5.0 → pgsync-3.1.0}/docs/contributing.rst +0 -0
  61. {pgsync-2.5.0 → pgsync-3.1.0}/docs/history.rst +0 -0
  62. {pgsync-2.5.0 → pgsync-3.1.0}/docs/index.rst +0 -0
  63. {pgsync-2.5.0 → pgsync-3.1.0}/docs/installation.rst +0 -0
  64. {pgsync-2.5.0 → pgsync-3.1.0}/docs/logo.png +0 -0
  65. {pgsync-2.5.0 → pgsync-3.1.0}/docs/make.bat +0 -0
  66. {pgsync-2.5.0 → pgsync-3.1.0}/docs/readme.rst +0 -0
  67. {pgsync-2.5.0 → pgsync-3.1.0}/docs/usage.rst +0 -0
  68. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/exc.py +0 -0
  69. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync.egg-info/SOURCES.txt +0 -0
  70. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync.egg-info/dependency_links.txt +0 -0
  71. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync.egg-info/not-zip-safe +0 -0
  72. {pgsync-2.5.0 → pgsync-3.1.0}/pgsync.egg-info/top_level.txt +0 -0
  73. {pgsync-2.5.0 → pgsync-3.1.0}/tests/__init__.py +0 -0
  74. {pgsync-2.5.0 → pgsync-3.1.0}/tests/fixtures/schema.json +0 -0
  75. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_env_vars.py +0 -0
  76. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_helper.py +0 -0
  77. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_log_handlers.py +0 -0
  78. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_query_builder.py +0 -0
  79. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_transform.py +0 -0
  80. {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_urls.py +0 -0
pgsync-3.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Tolu Aina
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pgsync
3
- Version: 2.5.0
3
+ Version: 3.1.0
4
4
  Summary: Postgres to Elasticsearch/OpenSearch sync
5
5
  Home-page: https://github.com/toluaina/pgsync
6
6
  Author: Tolu Aina
7
7
  Author-email: tolu@pgsync.com
8
8
  Maintainer: Tolu Aina
9
9
  Maintainer-email: tolu@pgsync.com
10
- License: LGPLv3
10
+ License: MIT
11
11
  Project-URL: Bug Reports, https://github.com/toluaina/pgsync/issues
12
12
  Project-URL: Funding, https://github.com/sponsors/toluaina
13
13
  Project-URL: Source, https://github.com/toluaina/pgsync
@@ -17,18 +17,48 @@ Keywords: pgsync,elasticsearch,opensearch,postgres,change data capture
17
17
  Classifier: Development Status :: 5 - Production/Stable
18
18
  Classifier: Intended Audience :: Developers
19
19
  Classifier: Natural Language :: English
20
- Classifier: Programming Language :: Python :: 3.7
21
20
  Classifier: Programming Language :: Python :: 3.8
22
21
  Classifier: Programming Language :: Python :: 3.9
23
22
  Classifier: Programming Language :: Python :: 3.10
24
23
  Classifier: Programming Language :: Python :: 3.11
24
+ Classifier: Programming Language :: Python :: 3.12
25
25
  Classifier: Programming Language :: Python :: Implementation :: CPython
26
26
  Classifier: Programming Language :: Python :: Implementation :: PyPy
27
- Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)
28
- Requires-Python: >=3.7.0
27
+ Classifier: License :: OSI Approved :: MIT License
28
+ Classifier: Operating System :: OS Independent
29
+ Requires-Python: >=3.8.0
29
30
  Description-Content-Type: text/markdown
30
31
  License-File: LICENSE
31
32
  License-File: AUTHORS.rst
33
+ Requires-Dist: async-timeout==4.0.3
34
+ Requires-Dist: boto3==1.34.11
35
+ Requires-Dist: botocore==1.34.11
36
+ Requires-Dist: certifi==2023.11.17
37
+ Requires-Dist: charset-normalizer==3.3.2
38
+ Requires-Dist: click==8.1.7
39
+ Requires-Dist: elastic-transport==8.11.0
40
+ Requires-Dist: elasticsearch==8.11.1
41
+ Requires-Dist: elasticsearch-dsl==8.11.0
42
+ Requires-Dist: environs==10.0.0
43
+ Requires-Dist: greenlet==3.0.3
44
+ Requires-Dist: idna==3.6
45
+ Requires-Dist: jmespath==1.0.1
46
+ Requires-Dist: marshmallow==3.20.1
47
+ Requires-Dist: opensearch-dsl==2.1.0
48
+ Requires-Dist: opensearch-py==2.4.2
49
+ Requires-Dist: packaging==23.2
50
+ Requires-Dist: psycopg2-binary==2.9.9
51
+ Requires-Dist: python-dateutil==2.8.2
52
+ Requires-Dist: python-dotenv==1.0.0
53
+ Requires-Dist: redis==5.0.1
54
+ Requires-Dist: requests==2.31.0
55
+ Requires-Dist: requests-aws4auth==1.2.3
56
+ Requires-Dist: s3transfer==0.10.0
57
+ Requires-Dist: six==1.16.0
58
+ Requires-Dist: sqlalchemy==2.0.25
59
+ Requires-Dist: sqlparse==0.4.4
60
+ Requires-Dist: typing-extensions==4.9.0
61
+ Requires-Dist: urllib3==1.26.18
32
62
 
33
63
  # PostgreSQL to Elasticsearch/OpenSearch sync
34
64
 
@@ -40,7 +70,7 @@ expose structured denormalized documents in [Elasticsearch](https://www.elastic.
40
70
 
41
71
  ### Requirements
42
72
 
43
- - [Python](https://www.python.org) 3.7+
73
+ - [Python](https://www.python.org) 3.8+
44
74
  - [Postgres](https://www.postgresql.org) 9.6+
45
75
  - [Redis](https://redis.io) 3.1.0
46
76
  - [Elasticsearch](https://www.elastic.co/products/elastic-stack) 6.3.1+ or [OpenSearch](https://opensearch.org/) 1.3.7+
@@ -66,7 +66,7 @@ the search capabilities of [Elasticsearch](https://www.elastic.co/products/elast
66
66
 
67
67
  #### How it works
68
68
 
69
- PGSync is written in Python (supporting version 3.7 onwards) and the stack is composed of: [Redis](https://redis.io), [Elasticsearch](https://www.elastic.co/products/elastic-stack)/[OpenSearch](https://opensearch.org/), [Postgres](https://www.postgresql.org), and [SQlAlchemy](https://www.sqlalchemy.org).
69
+ PGSync is written in Python (supporting version 3.8 onwards) and the stack is composed of: [Redis](https://redis.io), [Elasticsearch](https://www.elastic.co/products/elastic-stack)/[OpenSearch](https://opensearch.org/), [Postgres](https://www.postgresql.org), and [SQlAlchemy](https://www.sqlalchemy.org).
70
70
 
71
71
  PGSync leverages the [logical decoding](https://www.postgresql.org/docs/current/logicaldecoding.html) feature of [Postgres](https://www.postgresql.org) (introduced in PostgreSQL 9.4) to capture a continuous stream of change events.
72
72
  This feature needs to be enabled in your [Postgres](https://www.postgresql.org) configuration file by setting in the postgresql.conf file:
@@ -152,7 +152,7 @@ Key features of PGSync are:
152
152
 
153
153
  #### Requirements
154
154
 
155
- - [Python](https://www.python.org) 3.7+
155
+ - [Python](https://www.python.org) 3.8+
156
156
  - [Postgres](https://www.postgresql.org) 9.6+
157
157
  - [Redis](https://redis.io) 3.1.0
158
158
  - [Elasticsearch](https://www.elastic.co/products/elastic-stack) 6.3.1+ or [OpenSearch](https://opensearch.org/) 1.3.7+
@@ -305,8 +305,8 @@ Contributions are very welcome! Check out the [Contribution](CONTRIBUTING.rst) G
305
305
 
306
306
  #### License
307
307
 
308
- This code is released under the [GNU Lesser General Public License](https://www.gnu.org/licenses/gpl-3.0.html), version 3.0 (LGPL-3.0).
308
+ This project is licensed under the terms of the [MIT](https://opensource.org/license/mit/) license.
309
309
  Please see [LICENSE](LICENSE) for more details.
310
310
 
311
- You should have received a copy of the GNU Lesser General Public License along with PGSync.
312
- If not, see https://www.gnu.org/licenses/.
311
+ You should have received a copy of the MIT License along with PGSync.
312
+ If not, see https://opensource.org/license/mit/.
@@ -8,7 +8,7 @@ expose structured denormalized documents in [Elasticsearch](https://www.elastic.
8
8
 
9
9
  ### Requirements
10
10
 
11
- - [Python](https://www.python.org) 3.7+
11
+ - [Python](https://www.python.org) 3.8+
12
12
  - [Postgres](https://www.postgresql.org) 9.6+
13
13
  - [Redis](https://redis.io) 3.1.0
14
14
  - [Elasticsearch](https://www.elastic.co/products/elastic-stack) 6.3.1+ or [OpenSearch](https://opensearch.org/) 1.3.7+
@@ -54,9 +54,15 @@ def main(teardown, config, user, password, host, port, verbose):
54
54
 
55
55
  show_settings(config)
56
56
 
57
- for document in config_loader(config):
57
+ validate: bool = False if teardown else True
58
+
59
+ for doc in config_loader(config):
58
60
  sync: Sync = Sync(
59
- document, verbose=verbose, repl_slots=False, **kwargs
61
+ doc,
62
+ verbose=verbose,
63
+ validate=validate,
64
+ repl_slots=False,
65
+ **kwargs,
60
66
  )
61
67
  if teardown:
62
68
  sync.teardown()
@@ -1,44 +1,43 @@
1
1
  #!/usr/bin/env python
2
2
 
3
3
  """
4
- Parallel sync is an experimental feature that leverages the available
5
- CPU's/Threads to increase throughput.
6
- This is can be useful for environments that have a high network latency.
7
-
8
- In this scenario, your PG database, Elasticsearch/OpenSearch, and PGSync
9
- servers are on different networks with a delay between request/response time.
10
- The main bottleneck, in this case, is usually the roundtrip of the database
11
- query.
12
-
13
- Even with server-side cursors, we are still only able to fetch
14
- a limited number of records at a time from the cursor.
15
- The delay in the next cursor fetch can slow down the overall sync
16
- considerably.
17
-
18
- The solution here is to perform an initial fast/parallel sync
19
- to populate Elasticsearch/OpenSearch in a single iteration.
20
- When this is complete, we can then continue to run the normal `pgsync`
21
- as a daemon.
22
-
23
- This approach uses the Tuple identifier record of the table columns.
24
- Each table contains a system column - "ctid" of type "tid" that
25
- identifies the page record and row number in each block.
26
-
27
- We can use this to paginate the sync process.
28
- Pagination here technically implies that we are splitting each paged record
29
- between CPU's/Threads.
30
-
31
- This allows us to perform Elasticserch/OpenSearch bulk inserts in parallel.
32
- The "ctid" is a tuple of (page, row-number) e.g (1, 5) that identifies the
33
- row in a disk page.
34
-
35
- This method allows us to fetch all paged row records upfront and split them
36
- into work units amongst the workers(threads/cpus).
37
- Each chunk of work is defined by the BLOCK_SIZE and corresponds to the number
38
- of root node records each worker needs to process.
39
-
40
- The worker's query for each chunk of work filtering by the page number
41
- and row numbers.
4
+ Parallel sync is an innovative, experimental feature designed to optimize
5
+ throughput by utilizing available CPUs/threads, particularly beneficial
6
+ in environments experiencing high network latency.
7
+
8
+ Scenario & Challenge:
9
+ In instances where your PG database, Elasticsearch/OpenSearch, and PGSync
10
+ servers operate on divergent networks, a delay in request/response time is
11
+ noticeable. The primary constraint emerges from the database query's roundtrip,
12
+ which even server-side cursors can address only to a limited extent by fetching
13
+ a certain number of records at a time. The consequent delay in fetching the
14
+ next cursor significantly hampers the overall synchronization speed.
15
+
16
+ Solution:
17
+ To mitigate this, the strategy is to conduct an initial fast/parallel sync,
18
+ thereby populating Elasticsearch/OpenSearch in a single iteration.
19
+ Post this, the regular pgsync can continue running as a daemon.
20
+
21
+ Approach and Technical Implementation:
22
+ The approach centers around utilizing the Tuple identifier record of the table
23
+ columns. Every table incorporates a system column – "ctid" of type "tid,"
24
+ which helps identify the page record and the row number in each block.
25
+ This element facilitates the pagination of the sync process.
26
+
27
+ Technically, pagination implies dividing each paged record amongst the
28
+ available CPUs/threads. This division enables the parallel execution of
29
+ Elasticsearch/OpenSearch bulk inserts. The "ctid" serves as a tuple
30
+ (for instance, (1, 5)), pinpointing the row in a disk page.
31
+
32
+ By leveraging this method, all paged row records are retrieved upfront and
33
+ allocated as work units across the worker threads/CPUs.
34
+ Each work unit, defined by the BLOCK_SIZE, denotes the number of root node
35
+ records assigned for each worker to process.
36
+
37
+ Subsequently, the workers execute queries for each assigned chunk of work,
38
+ filtered based on the page number and row numbers.
39
+ This systematic and parallel approach optimizes the synchronization process,
40
+ especially in environments challenged by network latency.
42
41
  """
43
42
 
44
43
  import asyncio
@@ -46,39 +45,50 @@ import multiprocessing
46
45
  import os
47
46
  import re
48
47
  import sys
48
+ import typing as t
49
49
  from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
50
50
  from dataclasses import dataclass
51
51
  from queue import Queue
52
52
  from threading import Thread
53
- from typing import Generator, Optional, Union
54
53
 
55
54
  import click
56
55
  import sqlalchemy as sa
57
56
 
58
57
  from pgsync.settings import BLOCK_SIZE, CHECKPOINT_PATH
59
58
  from pgsync.sync import Sync
60
- from pgsync.utils import (
61
- compiled_query,
62
- config_loader,
63
- get_config,
64
- show_settings,
65
- timeit,
66
- )
59
+ from pgsync.utils import config_loader, get_config, show_settings, timeit
60
+
67
61
 
62
+ def save_ctid(page: int, row: int, filename: str) -> None:
63
+ """
64
+ Save the checkpoint for a given page and row in a file with the given name.
68
65
 
69
- def save_ctid(page: int, row: int, name: str) -> None:
70
- checkpoint_file: str = os.path.join(CHECKPOINT_PATH, f".{name}.ctid")
71
- with open(checkpoint_file, "w+") as fp:
66
+ Args:
67
+ page (int): The page number to save.
68
+ row (int): The row number to save.
69
+ filename (str): The name of the file to save the checkpoint in.
70
+ """
71
+ filepath: str = os.path.join(CHECKPOINT_PATH, f".{filename}.ctid")
72
+ with open(filepath, "w+") as fp:
72
73
  fp.write(f"{page},{row}\n")
73
74
 
74
75
 
75
- def read_ctid(name: str) -> None:
76
- checkpoint_file: str = os.path.join(CHECKPOINT_PATH, f".{name}.ctid")
77
- if os.path.exists(checkpoint_file):
78
- with open(checkpoint_file, "r") as fp:
76
+ def read_ctid(filename: str) -> t.Tuple[t.Optional[int], t.Optional[int]]:
77
+ """
78
+ Reads the checkpoint file for the given name and returns the page and row numbers.
79
+
80
+ Args:
81
+ filename (str): The name of the checkpoint file.
82
+
83
+ Returns:
84
+ tuple: A tuple containing the page and row numbers. If the checkpoint file does not exist, returns (None, None).
85
+ """
86
+ filepath: str = os.path.join(CHECKPOINT_PATH, f".{filename}.ctid")
87
+ if os.path.exists(filepath):
88
+ with open(filepath, "r") as fp:
79
89
  pairs: str = fp.read().split()[0].split(",")
80
- page = int(pairs[0])
81
- row = int(pairs[1])
90
+ page: int = int(pairs[0])
91
+ row: int = int(pairs[1])
82
92
  return page, row
83
93
  return None, None
84
94
 
@@ -104,7 +114,6 @@ class Task:
104
114
  sync: Sync = Sync(
105
115
  self.doc, verbose=self.verbose, validate=self.validate
106
116
  )
107
- sync.tree.build(sync.nodes)
108
117
  txmin: int = sync.checkpoint
109
118
  txmax: int = sync.txid_current
110
119
  sync.search_client.bulk(
@@ -118,19 +127,19 @@ class Task:
118
127
  @timeit
119
128
  def fetch_tasks(
120
129
  doc: dict,
121
- block_size: Optional[int] = None,
122
- ) -> Generator:
130
+ block_size: t.Optional[int] = None,
131
+ ) -> t.Generator:
123
132
  block_size = block_size or BLOCK_SIZE
124
133
  pages: dict = {}
125
134
  sync: Sync = Sync(doc)
126
- page: Optional[int] = None
127
- row: Optional[int] = None
128
- name: str = re.sub(
135
+ page: t.Optional[int] = None
136
+ row: t.Optional[int] = None
137
+ filename: str = re.sub(
129
138
  "[^0-9a-zA-Z_]+", "", f"{sync.database.lower()}_{sync.index}"
130
139
  )
131
- page, row = read_ctid(name=name)
140
+ page, row = read_ctid(filename)
132
141
  statement: sa.sql.Select = sa.select(
133
- [
142
+ *[
134
143
  sa.literal_column("1").label("x"),
135
144
  sa.literal_column("1").label("y"),
136
145
  sa.column("ctid"),
@@ -197,11 +206,13 @@ def fetch_tasks(
197
206
 
198
207
  @timeit
199
208
  def synchronous(
200
- tasks: Generator, doc: dict, verbose: bool = False, validate: bool = False
209
+ tasks: t.Generator,
210
+ doc: dict,
211
+ verbose: bool = False,
212
+ validate: bool = False,
201
213
  ) -> None:
202
214
  sys.stdout.write("Synchronous\n")
203
215
  sync: Sync = Sync(doc, verbose=verbose, validate=validate)
204
- sync.tree.build(sync.nodes)
205
216
  txmin: int = sync.checkpoint
206
217
  txmax: int = sync.txid_current
207
218
  index: str = sync.index
@@ -215,9 +226,9 @@ def synchronous(
215
226
 
216
227
  @timeit
217
228
  def multithreaded(
218
- tasks: Generator,
229
+ tasks: t.Generator,
219
230
  doc: dict,
220
- nprocs: Optional[int] = None,
231
+ nthreads: t.Optional[int] = None,
221
232
  verbose: bool = False,
222
233
  validate: bool = False,
223
234
  ) -> None:
@@ -234,12 +245,11 @@ def multithreaded(
234
245
  )
235
246
  queue.task_done()
236
247
 
237
- nprocs: int = nprocs or 1
248
+ nthreads: int = nthreads or 1
238
249
  queue: Queue = Queue()
239
250
  sync: Sync = Sync(doc, verbose=verbose, validate=validate)
240
- sync.tree.build(sync.nodes)
241
251
 
242
- for _ in range(nprocs):
252
+ for _ in range(nthreads):
243
253
  thread: Thread = Thread(
244
254
  target=worker,
245
255
  args=(
@@ -258,15 +268,15 @@ def multithreaded(
258
268
 
259
269
  @timeit
260
270
  def multiprocess(
261
- tasks: Generator,
271
+ tasks: t.Generator,
262
272
  doc: dict,
263
- nprocs: Optional[int] = None,
273
+ ncpus: t.Optional[int] = None,
264
274
  verbose: bool = False,
265
275
  validate: bool = False,
266
276
  ) -> None:
267
277
  sys.stdout.write("Multiprocess\n")
268
278
  task: Task = Task(doc, verbose=verbose, validate=validate)
269
- with ProcessPoolExecutor(max_workers=nprocs) as executor:
279
+ with ProcessPoolExecutor(max_workers=ncpus) as executor:
270
280
  try:
271
281
  list(executor.map(task.process, tasks))
272
282
  except Exception as e:
@@ -276,14 +286,14 @@ def multiprocess(
276
286
 
277
287
  @timeit
278
288
  def multithreaded_async(
279
- tasks: Generator,
289
+ tasks: t.Generator,
280
290
  doc: dict,
281
- nprocs: Optional[int] = None,
291
+ nthreads: t.Optional[int] = None,
282
292
  verbose: bool = False,
283
293
  validate: bool = False,
284
294
  ) -> None:
285
295
  sys.stdout.write("Multi-threaded async\n")
286
- executor: ThreadPoolExecutor = ThreadPoolExecutor(max_workers=nprocs)
296
+ executor: ThreadPoolExecutor = ThreadPoolExecutor(max_workers=nthreads)
287
297
  event_loop = asyncio.get_event_loop()
288
298
  event_loop.run_until_complete(
289
299
  run_tasks(executor, tasks, doc, verbose=verbose, validate=validate)
@@ -293,14 +303,14 @@ def multithreaded_async(
293
303
 
294
304
  @timeit
295
305
  def multiprocess_async(
296
- tasks: Generator,
306
+ tasks: t.Generator,
297
307
  doc: dict,
298
- nprocs: Optional[int] = None,
308
+ ncpus: t.Optional[int] = None,
299
309
  verbose: bool = False,
300
310
  validate: bool = False,
301
311
  ) -> None:
302
312
  sys.stdout.write("Multi-process async\n")
303
- executor: ProcessPoolExecutor = ProcessPoolExecutor(max_workers=nprocs)
313
+ executor: ProcessPoolExecutor = ProcessPoolExecutor(max_workers=ncpus)
304
314
  event_loop = asyncio.get_event_loop()
305
315
  try:
306
316
  event_loop.run_until_complete(
@@ -312,18 +322,18 @@ def multiprocess_async(
312
322
 
313
323
 
314
324
  async def run_tasks(
315
- executor: Union[ThreadPoolExecutor, ProcessPoolExecutor],
316
- tasks: Generator,
325
+ executor: t.Union[ThreadPoolExecutor, ProcessPoolExecutor],
326
+ tasks: t.Generator,
317
327
  doc: dict,
318
328
  verbose: bool = False,
319
329
  validate: bool = False,
320
330
  ) -> None:
321
- sync: Optional[Sync] = None
331
+ sync: t.Optional[Sync] = None
322
332
  if isinstance(executor, ThreadPoolExecutor):
323
333
  # threads can share a common Sync object
324
334
  sync = Sync(doc, verbose=verbose, validate=validate)
325
335
  event_loop = asyncio.get_event_loop()
326
- completed, pending = await asyncio.wait(
336
+ completed, _ = await asyncio.wait(
327
337
  [
328
338
  event_loop.run_in_executor(
329
339
  executor, run_task, task, sync, doc, verbose, validate
@@ -338,14 +348,13 @@ async def run_tasks(
338
348
 
339
349
  def run_task(
340
350
  task: dict,
341
- sync: Optional[Sync] = None,
342
- doc: Optional[dict] = None,
351
+ sync: t.Optional[Sync] = None,
352
+ doc: t.Optional[dict] = None,
343
353
  verbose: bool = False,
344
354
  validate: bool = False,
345
355
  ) -> int:
346
356
  if sync is None:
347
357
  sync: Sync = Sync(doc, verbose=verbose, validate=validate)
348
- sync.tree.build(sync.nodes)
349
358
  txmin: int = sync.checkpoint
350
359
  txmax: int = sync.txid_current
351
360
  sync.search_client.bulk(
@@ -355,10 +364,10 @@ def run_task(
355
364
  if len(task) > 0:
356
365
  page: int = max(task.keys())
357
366
  row: int = max(task[page])
358
- name: str = re.sub(
367
+ filename: str = re.sub(
359
368
  "[^0-9a-zA-Z_]+", "", f"{sync.database.lower()}_{sync.index}"
360
369
  )
361
- save_ctid(page=page, row=row, name=name)
370
+ save_ctid(page, row, filename)
362
371
 
363
372
  return 1
364
373
 
@@ -410,20 +419,18 @@ def main(config, nprocs, mode, verbose):
410
419
  show_settings()
411
420
  config: str = get_config(config)
412
421
 
413
- for document in config_loader(config):
414
- tasks: Generator = fetch_tasks(document)
422
+ for doc in config_loader(config):
423
+ tasks: t.Generator = fetch_tasks(doc)
415
424
  if mode == "synchronous":
416
- synchronous(tasks, document, verbose=verbose)
425
+ synchronous(tasks, doc, verbose=verbose)
417
426
  elif mode == "multithreaded":
418
- multithreaded(tasks, document, nprocs=nprocs, verbose=verbose)
427
+ multithreaded(tasks, doc, nthreads=nprocs, verbose=verbose)
419
428
  elif mode == "multiprocess":
420
- multiprocess(tasks, document, nprocs=nprocs, verbose=verbose)
429
+ multiprocess(tasks, doc, ncpus=nprocs, verbose=verbose)
421
430
  elif mode == "multithreaded_async":
422
- multithreaded_async(
423
- tasks, document, nprocs=nprocs, verbose=verbose
424
- )
431
+ multithreaded_async(tasks, doc, nthreads=nprocs, verbose=verbose)
425
432
  elif mode == "multiprocess_async":
426
- multiprocess_async(tasks, document, nprocs=nprocs, verbose=verbose)
433
+ multiprocess_async(tasks, doc, ncpus=nprocs, verbose=verbose)
427
434
 
428
435
 
429
436
  if __name__ == "__main__":
@@ -2,4 +2,4 @@
2
2
 
3
3
  __author__ = "Tolu Aina"
4
4
  __email__ = "tolu@pgsync.com"
5
- __version__ = "2.5.0"
5
+ __version__ = "3.1.0"