PyPI - shareddata - Versions diffs - 6.83.8__tar.gz → 6.83.12__tar.gz - Mend

shareddata 6.83.8 (tar.gz) → 6.83.12 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (94)

{shareddata-6.83.8/src/shareddata.egg-info → shareddata-6.83.12}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: shareddata
-Version: 6.83.8
+Version: 6.83.12
 Summary: Memory Mapped / Shared Memory Database with S3 repository
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License

{shareddata-6.83.8 → shareddata-6.83.12}/setup.py RENAMED Viewed

@@ -34,7 +34,7 @@ install_requires = [
 setup(
     name='shareddata',
-    version='6.83.8',
+    version='6.83.12',
     description='Memory Mapped / Shared Memory Database with S3 repository',
     long_description=open('README.md').read(),
     long_description_content_type='text/markdown',

{shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/CollectionMongoDB.py RENAMED Viewed

@@ -112,7 +112,7 @@ class CollectionMongoDB:
             # Check for indexes other than the default _id index
             self.hasindex = any(index_name == pkey_name for index_name in index_info)
             # If index was requested but the collection was previously created without one, create it now
-            if hasindex and not self.hasindex:
+            if hasindex and not self.hasindex and self.period in ['D1', 'M15', 'M1']:
                 self.mongodb.client[self.user][self.relpath].create_index(pkey_fields, unique=True, name=pkey_name)
                 self.mongodb.client[self.user][self.relpath].create_index([("mtime", DESCENDING)])
                 self.hasindex = True

{shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Logger.py RENAMED Viewed

@@ -20,6 +20,36 @@ import json
 import requests
 import lz4
+# Column layout of the semicolon-separated log files under DATABASE_FOLDER/Logs.
+# 6 separators => 7 fields; the 7th (message) may itself contain ';'.
+_LOG_COLUMNS = ['shardid', 'sequence_number', 'user_name',
+                'asctime', 'logger_name', 'level', 'message']
+def _parse_log_lines(text):
+    """Parse ';'-delimited log text into a DataFrame — stdlib only.
+    Uses str.split(';', 6) so any ';' inside the message field is preserved
+    verbatim (matching the prior `shape[1] > 7 -> merge cols 6..` behavior
+    that the pandas-based readLogs used to handle that case).
+    No pandas C parser is involved, so this cannot trip the pandas_parser
+    SIGSEGV that Logger.readLogs kept hitting at base_parser._infer_types.
+    Lines with fewer than 7 fields or empty asctime are dropped, matching
+    the previous on_bad_lines='skip' + dfnewlines[asctime].notna() filter.
+    """
+    rows = []
+    append = rows.append
+    for line in text.splitlines():
+        if not line:
+            continue
+        parts = line.split(';', 6)
+        if len(parts) < 7 or not parts[3]:
+            continue
+        append(parts)
+    return pd.DataFrame(rows, columns=_LOG_COLUMNS)
 from SharedData.IO.LogHandlerAPI import LogHandlerAPI
@@ -287,13 +317,11 @@ class Logger:
             ((pd.Timestamp.utcnow() + timedelta(days=-1)).strftime('%Y%m%d')+'.log')
         if lastlogfilepath.is_file():
             try:
-                _dflogs = pd.read_csv(lastlogfilepath, header=None, sep=';',
-                                      engine='python', on_bad_lines='skip')
-                _dflogs.columns = ['shardid', 'sequence_number',
-                                   'user_name', 'asctime', 'logger_name', 'level', 'message']
+                with open(lastlogfilepath, 'r', encoding='utf-8', errors='replace') as _f:
+                    _dflogs = _parse_log_lines(_f.read())
                 Logger.dflogs = pd.concat([_dflogs, Logger.dflogs], axis=0, ignore_index=True)
                 Logger.getLastLog(Logger.dflogs)
-                Logger.getStatus(Logger.dflogs)
+                Logger.getStatus(Logger.dflogs)
             except Exception as e:
                 print(f'Error reading last day logs: {e}')
@@ -355,24 +383,29 @@ class Logger:
             Logger.logfilepath = _logfilepath
         if Logger.logfilepath.is_file():
-            with open(Logger.logfilepath, 'r') as file:
+            # Binary mode so _logfileposition is a plain byte offset with
+            # well-defined seek() semantics (text-mode tell() cookies cannot
+            # be manipulated arithmetically).
+            with open(Logger.logfilepath, 'rb') as file:
                 file.seek(Logger._logfileposition)
-                newlines = '\n'.join(file.readlines())
-                if not newlines.strip():   # fix: prevent pd.read_csv crash on empty string
+                raw = file.read()
+                # Only consume up to the last complete line. A trailing
+                # partial (writer still mid-flush, or a kill -9 that left a
+                # half-line behind) is left in place so the next tick picks
+                # it up once '\n' is written. This is stricter than the old
+                # pandas path, which accepted the partial and advanced past
+                # it, silently losing the remainder bytes.
+                last_nl = raw.rfind(b'\n')
+                if last_nl < 0:
                     return dfnewlines
-                dfnewlines = pd.read_csv(StringIO(newlines), header=None, sep=';',
-                                            engine='python', on_bad_lines='skip')
-                if dfnewlines.shape[1] > 7:
-                    # Merge all columns from 6 onward into a single message
-                    message = dfnewlines.iloc[:, 6:].apply(lambda x: ';'.join(x.dropna().astype(str)), axis=1)
-                    dfnewlines = dfnewlines.iloc[:, :7]
-                    dfnewlines.iloc[:, 6] = message
-                dfnewlines.columns = [
-                    'shardid', 'sequence_number', 'user_name', 'asctime', 'logger_name', 'level', 'message'
-                ]
-                dfnewlines = dfnewlines[dfnewlines['asctime'].notna()]
+                complete_bytes = raw[:last_nl + 1]
+                text = complete_bytes.decode('utf-8', errors='replace')
+                Logger._logfileposition += len(complete_bytes)
+                # _parse_log_lines drops empty lines and rows whose asctime
+                # field is empty, folding the prior on_bad_lines='skip' +
+                # dfnewlines[asctime].notna() filter into the parse step.
+                dfnewlines = _parse_log_lines(text)
                 if dfnewlines.empty:
                     return dfnewlines
@@ -401,12 +434,11 @@ class Logger:
                     Logger._max_asctime = max(
                         max_asctime,
                         dfnewlines_sorted['asctime'].iloc[-1]
-                    )
-                Logger._logfileposition = file.tell()
+                    )
+                # _logfileposition was already advanced above, right after we
+                # decided which prefix of the read to consume.
                 return dfnewlines_sorted
         return dfnewlines
     @staticmethod

{shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Routines/WorkerPool.py RENAMED Viewed

@@ -519,18 +519,27 @@ class WorkerPool:
             time.sleep(60)
     @staticmethod
-    def get_active_jobs(shdata):
-        # keep track of active jobs
+    def get_active_jobs(shdata, acquire=True):
+        # Schemaless reads must hold the table mutex: the writer updates
+        # bson bytes, bson_size and bson_ptr as separate non-atomic steps,
+        # and a lock-free reader can catch mid-update state and fail to
+        # decode BSON ("bad eoo that should never happen").
         job_table = WorkerPool.get_job_table(shdata)
-        buff = np.full((1,),dtype=job_table.dtype,fill_value=np.nan)
-        buff['hash'][0] = 'ACTIVE_JOBS'
-        loc = job_table.get_loc(buff, acquire=False)
-        if loc[0] != -1:
-            active_jobs = job_table.get_dict_list(job_table[loc[0]],acquire=False)[0]
-        else:
-            active_jobs = {
-                'hash': 'ACTIVE_JOBS',
-                'status': 'ACTIVE',
-                'jobs': {},
-            }
+        try:
+            if acquire:
+                job_table.acquire()
+            buff = np.full((1,),dtype=job_table.dtype,fill_value=np.nan)
+            buff['hash'][0] = 'ACTIVE_JOBS'
+            loc = job_table.get_loc(buff, acquire=False)
+            if loc[0] != -1:
+                active_jobs = job_table.get_dict_list(job_table[loc[0]],acquire=False)[0]
+            else:
+                active_jobs = {
+                    'hash': 'ACTIVE_JOBS',
+                    'status': 'ACTIVE',
+                    'jobs': {},
+                }
+        finally:
+            if acquire:
+                job_table.release()
         return active_jobs

{shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/SharedData.py RENAMED Viewed

@@ -541,14 +541,34 @@ class SharedData:
             names=names,formats=formats,size=100e3)
     @staticmethod
-    def mutex(shm_name, pid):
+    def mutex(shm_name, pid):
         """
-        Create or attach to a shared memory mutex object and register the process ID.
+        Allocate (or attach to) the per-table MUTEX shared-memory struct AND
+        ACQUIRE its `pid` lock. Returns once the lock is held by `pid`.
+        =====================================================================
+        IMPORTANT — this ACQUIRES the per-table mutex as a side effect.
+        =====================================================================
+        Called from `Table.init_schema()`; `Table.__init__` then keeps the
+        lock held for the remainder of `__init__`, releasing it only in the
+        `finally: self.release()` at the bottom. Every code path reachable
+        from `Table.__init__` — incl. `create_table()`, `load_table()`,
+        `read_header()`, `download()`, `TableDisk.malloc()`,
+        `index.initialize()`, `index.malloc_pkey()`, `create_index()` —
+        runs with this table's own mutex already held. Do NOT call
+        `acquire()` (or anything that internally re-acquires this same
+        mutex, e.g. `self.acquire()` on the same Table) from anywhere in
+        that call tree: the mutex is a non-reentrant CAS spinlock and
+        re-acquiring it self-deadlocks (the staleness check in `acquire()`
+        only force-acquires when the holder PID is *dead* — a self-
+        deadlocked caller is still alive).
+        Original docstring follows:
         This static method attempts to create a shared memory segment with a specific name
         appended by '#mutex' to serve as a mutex structure containing process ID, type, and load status.
         If the shared memory segment already exists, it attaches to it instead.
         It then acquires the mutex using the SharedData.acquire method, registers the calling process ID
         by appending it to a CSV file located in the DATABASE_FOLDER environment path, and returns
         the shared memory object, the mutex numpy structured array, and a flag indicating whether the
@@ -588,8 +608,43 @@ class SharedData:
     @staticmethod
     def acquire(mutex, pid, relpath):
         """
+        Spin-CAS acquire of a per-table MUTEX's `pid` field (1us sleep loop).
+        Atomically sets `mutex.pid = pid` once it observes `mutex.pid == 0`.
+        Staleness recovery: if the current holder PID is *dead*
+        (`psutil.pid_exists` is False), force-acquires by CASing
+        `holder_pid -> pid`.
+        =====================================================================
+        *** NON-REENTRANT ***
+        =====================================================================
+        This is a CAS-based spinlock with NO owner-recursion tracking. If the
+        current process already holds the mutex, calling `acquire()` again
+        SELF-DEADLOCKS: the CAS against 0 never succeeds (the field holds our
+        own pid, not 0), and the staleness check only force-acquires when the
+        holder is DEAD — which a self-deadlocked caller is not.
+        Note especially that `Table.__init__` holds this mutex throughout its
+        lifetime (acquired in `init_schema()` via `SharedData.mutex()`,
+        released only at `__init__`'s `finally`). Code reachable from
+        `__init__` — `load_table()`, `create_table()`, `read_header()`,
+        `TableDisk.malloc()`, `TableIndex.initialize()`, `create_index()`,
+        etc. — must NOT call `acquire()` on the table's own mutex.
+        =====================================================================
+        *** BRIEF-HOLD CONTRACT ***
+        =====================================================================
+        The mutex protects short shared-memory critical sections (one upsert /
+        one batch_extend / one get_loc / the `__init__` setup) and is shared
+        across Python AND C++ writers. Do NOT hold it across slow I/O — every
+        waiter spins at 1us granularity, burning CPU. On the singleton SCHEMA
+        table this is catastrophic: stalling SCHEMA stalls every process that
+        opens any table (because every Table open resolves its folder via
+        `schema.get_loc()`).
+        Original docstring follows:
         Attempt to acquire a process-safe mutex semaphore by atomically setting its value to the current process ID (pid).
         This method uses a compare-and-swap atomic operation to acquire the mutex. If the mutex is already held by another process, it waits in a loop, checking every microsecond. If the mutex remains held beyond certain time thresholds (1 second on the first check, 15 seconds thereafter), it verifies whether the locking process is still active. If the locking process has terminated, it forcibly acquires the mutex. The method logs a warning if waiting for the semaphore continues beyond the initial timeout.
         Parameters:
@@ -601,12 +656,27 @@ class SharedData:
             None
         """
         tini = time.time()
+        tstart = tini  # total-wait timer; unlike tini it is never reset below
+        # The SCHEMA table is the singleton every process depends on (every
+        # Table open resolves its folder via schema.get_loc()). The staleness
+        # check below only recovers a DEAD holder — an alive-but-stuck holder
+        # (e.g. a re-entrant acquire) would otherwise hang every dependent
+        # process forever. So for the SCHEMA table only, abort with an error
+        # after 30s instead of spinning indefinitely.
+        schema_timeout = 30.0 if '/SCHEMA/table/' in relpath else None
         # semaphore is process safe
         telapsed = 0
         hdrptr = mutex.__array_interface__['data'][0]
         semseek = 0
         firstcheck = True
         while cpp.long_compare_and_swap(hdrptr, semseek, 0, pid) == 0:
+            if (schema_timeout is not None) and (time.time() - tstart > schema_timeout):
+                lockingpid = int(mutex['pid'])
+                errmsg = ('%s SCHEMA semaphore still held by pid %d after %.0fs — '
+                          'aborting acquire to avoid hanging every dependent process'
+                          % (relpath, lockingpid, schema_timeout))
+                Logger.log.error(errmsg)
+                raise TimeoutError(errmsg)
             # check if process that locked the mutex is still running
             telapsed = time.time() - tini
             if (telapsed > 15) | ((firstcheck) & (telapsed > 1)):
@@ -771,15 +841,11 @@ class SharedData:
         dfcollections = self.list_collections(keyword,user=user)
-        ls = dfremote.copy()
-        # merge local
-        ls = ls.reindex(index=ls.index.union(dflocal.index),
-                        columns=ls.columns.union(dflocal.columns))
-        ls.loc[dflocal.index,dflocal.columns] = dflocal.values
-        # merge collections
-        ls = ls.reindex(index=ls.index.union(dfcollections.index),
-                        columns=ls.columns.union(dfcollections.columns))
-        ls.loc[dfcollections.index,dfcollections.columns] = dfcollections.values
+        # merge local then collections; combine_first preserves dtypes (avoids
+        # FutureWarning from assigning object/datetime values into reindexed
+        # float64 NaN columns).
+        ls = dflocal.combine_first(dfremote)
+        ls = dfcollections.combine_first(ls)
         if len(ls)>0:
             ls = ls.reindex(columns=['folder_local', 'created_local', 'last_modified_local','size_local','files_local',
@@ -990,12 +1056,10 @@ class SharedData:
         dflocal = self.list_local(keyword,user=user)
         Logger.log.info(f'list_local took {time.time()-ti:.2f} seconds')
-        ls = dfremote.copy()
-        # merge local
-        ls = ls.reindex(index=ls.index.union(dflocal.index),
-                        columns=ls.columns.union(dflocal.columns))
-        ls.loc[dflocal.index,dflocal.columns] = dflocal.values
+        # combine_first preserves dtypes (avoids FutureWarning from assigning
+        # object/datetime values into reindexed float64 NaN columns).
+        ls = dflocal.combine_first(dfremote)
         if len(ls)>0:
             ls = ls.reindex(columns=['folder_local', 'created_local', 'last_modified_local','size_local','files_local',
                     'folder_remote', 'last_modified_remote', 'size_remote', 'files_remote','storage_class_remote',
@@ -1049,12 +1113,10 @@ class SharedData:
         dfremote = self.list_remote(keyword, user=user)
         dflocal = self.list_local(keyword, user=user)
-        ls = dfremote.copy()
-        # merge local
-        ls = ls.reindex(index=ls.index.union(dflocal.index),
-                        columns=ls.columns.union(dflocal.columns))
-        ls.loc[dflocal.index, dflocal.columns] = dflocal.values
+        # combine_first preserves dtypes (avoids FutureWarning from assigning
+        # object/datetime values into reindexed float64 NaN columns).
+        ls = dflocal.combine_first(dfremote)
         if len(ls) > 0:
             ls = ls.reindex(columns=['folder_local', 'created_local', 'last_modified_local','size_local','files_local',
                     'folder_remote', 'last_modified_remote', 'size_remote', 'files_remote','storage_class_remote',

{shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Table.py RENAMED Viewed

@@ -146,10 +146,20 @@ class Table:
         self.index = TableIndex(self)
         errmsg = ''
-        try:
+        try:
+            # init_schema() ACQUIRES this table's per-table mutex (via
+            # SharedData.mutex() -> SharedData.acquire()) and the lock is
+            # held until `finally: self.release()` at the bottom of this
+            # try/except. Every method called below — create_table(),
+            # load_table(), read_header(), TableDisk.malloc(),
+            # index.initialize(), index.create_index() — therefore runs
+            # WITH the mutex already held. Do NOT call self.acquire() (or
+            # anything that re-acquires this same mutex) from anywhere
+            # reachable from here: the mutex is a non-reentrant CAS
+            # spinlock and re-acquiring it self-deadlocks. See the
+            # SharedData.mutex / SharedData.acquire docstrings.
             self.init_schema()
             if (not self.exists) | (self.overwrite):
                 if not records is None:
@@ -190,8 +200,18 @@ class Table:
     def init_schema(self):
         """
-        Initialize the schema and related attributes for the table instance.
+        Initialize the schema and related attributes for the table instance,
+        AND ACQUIRE this table's per-table mutex (via `SharedData.mutex()` ->
+        `SharedData.acquire()`). The mutex is held from here until
+        `Table.__init__`'s `finally: self.release()` at the bottom — so
+        every method called from `__init__` (create_table, load_table,
+        read_header, malloc, index.initialize, ...) runs with the mutex
+        already held. It is non-reentrant; do not re-acquire it from
+        anywhere reachable from `__init__`. See `SharedData.mutex` /
+        `SharedData.acquire` docstrings for the full contract.
+        Original docstring follows:
         This method sets up header and tail header formats and names, initializes mutexes for shared memory access,
         determines the storage path based on user, database, period, source, and tablename, and manages schema information
         from shareddata. It handles synchronization and consistency checks for the table's loading type, updates or inserts
@@ -412,6 +432,14 @@ class Table:
     def create_table(self):
         # create new table or overwrite existing table
         """
+        Called from `Table.__init__` for a new or overwrite-requested table.
+        Runs WITH this table's per-table mutex already held — `init_schema()`
+        acquired it before this is called, and it is released only in
+        `__init__`'s `finally`. Therefore TableDisk.malloc(create=True)'s
+        `shf_hdr[0] = self.hdr` write below is serialized against C++ writers
+        (which acquire the same per-table mutex for `batch_extend`/`upsert`)
+        — it is NOT a lock-free clobber. Do NOT acquire the mutex again here.
         Create a new table or overwrite an existing one with the provided records.
         This method performs the following steps:
@@ -733,6 +761,19 @@ class Table:
     def load_table(self):
         # open existing table
         """
+        Called from `Table.__init__` for an existing table. Runs WITH this
+        table's per-table mutex already held — `init_schema()` acquired it
+        before this is called, and it is released only in `__init__`'s
+        `finally`. Therefore `read_header()` reads a quiescent header (no
+        C++ writer can mutate it concurrently — they take the same mutex
+        for `batch_extend`/`upsert`), and `TableDisk.malloc(create=True)`'s
+        `shf_hdr[0] = self.hdr` writes back the SAME bytes under the SAME
+        lock — it is idempotent, NOT a clobber. Do NOT acquire the mutex
+        again here: it is non-reentrant and re-acquire self-deadlocks. See
+        `SharedData.mutex` / `SharedData.acquire` docstrings.
+        Original docstring follows:
         Load data into the table, ensuring the local file is available and up to date.
         If a local table file exists, it reads the header from the file. If the table is not loaded or the local file does not exist, it downloads the table data. Memory allocation for the table is then performed.
@@ -746,23 +787,48 @@ class Table:
             with open(self.filepath, 'rb') as io_obj:
                 self.read_header(io_obj)
-        if ((self.mutex['isloaded']==0) | (not self.exists_local)):
-            self.download()
-        self.malloc(create=True)
+        # NOTE: this attach/cold-init split is an OPTIMIZATION ONLY — both
+        # paths produce equivalent state and both are concurrency-safe.
+        # `Table.__init__` holds this table's per-table mutex throughout
+        # (acquired in `init_schema()`, released in `__init__`'s `finally`),
+        # and C++ writers acquire the same mutex for `batch_extend`/`upsert`,
+        # so any concurrent C++ writer is blocked while we are in here.
+        # Therefore the historical `malloc(create=True)`'s
+        # `shf_hdr[0] = self.hdr` is idempotent — it writes back exactly the
+        # bytes `read_header()` just read under the same lock — and is NOT a
+        # live-writer clobber. The ATTACH path simply skips that idempotent
+        # rewrite (and the no-op `index.initialize()` on a healthy live
+        # table). Earlier comments here claimed `malloc(create=True)` would
+        # clobber a live writer; that premise was wrong (the mutex
+        # serializes them). Do NOT call `self.acquire()` anywhere below —
+        # the mutex is non-reentrant and re-acquire self-deadlocks. See
+        # `SharedData.mutex` / `SharedData.acquire` docstrings.
+        attach_only = (self.exists_local
+                       and (self.mutex['isloaded'] == 1)
+                       and (not self.header_changed))
+        if attach_only:
+            # ATTACH: skip the idempotent header rewrite (and index
+            # re-initialize) for healthy already-loaded tables.
+            self.attach()
+            if self.hasindex:
+                self.index.attach()
+        else:
+            if ((self.mutex['isloaded']==0) | (not self.exists_local)):
+                self.download()
+            self.malloc(create=True)
-        if self.hasindex:
-            self.index.initialize()
-            if self.records.count>0:
-                # check if index is coherent
-                loc = self.records.get_loc(self.records[0:1], acquire=False)
-                if loc[0]!=0:
-                    # index is not coherent
-                    self.hdr['isidxcreated']=0
-                    self.index.initialize()
+            if self.hasindex:
+                self.index.initialize()
+                if self.records.count>0:
+                    # check if index is coherent
                     loc = self.records.get_loc(self.records[0:1], acquire=False)
                     if loc[0]!=0:
-                        raise Exception('Cannot create index!')
+                        # index is not coherent
+                        self.hdr['isidxcreated']=0
+                        self.index.initialize()
+                        loc = self.records.get_loc(self.records[0:1], acquire=False)
+                        if loc[0]!=0:
+                            raise Exception('Cannot create index!')
     def read_header(self, io_obj):
         """

{shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/TableDisk.py RENAMED Viewed

@@ -109,8 +109,45 @@ class TableDisk(Table):
         offset = self.hdr.dtype.itemsize
         self.shf_data = np.memmap(self.filepath,self.recdtype,'r+',offset,(self.hdr['recordssize'],))
         self.records = SharedNumpy('DISK', self.shf_data)
-        self.records.table = self
+        self.records.table = self
+    ############### ATTACH ###############
+    def attach(self):
+        # map an already-initialized table WITHOUT writing the shared header
+        """
+        Memory-map a table that another process already has loaded (mutex
+        isloaded==1) and may be actively writing.
+        Unlike malloc(create=True), this NEVER writes the shared header.
+        malloc does `self.shf_hdr[0] = self.hdr` (a whole-struct write of this
+        process's stale read_header() copy) plus `hdr['recordssize'] =
+        int(self.size)`; running that against a live table reverts the
+        `count`/`recordssize` the writing process just advanced — with no
+        mutex held — which makes the writer overwrite already-published rows
+        and lock-free readers see torn/garbage records. attach() only reads
+        the header and maps the file.
+        """
+        # memory map header — read the live header, never write it
+        self.shf_hdr = np.memmap(self.filepath, self.hdrdtype, 'r+', 0, (1,))
+        self.hdr = self.shf_hdr[0]
+        # adopt the live recordssize, not this process's stale read_header copy
+        self.size = int(self.hdr['recordssize'])
+        # the file must actually back header + recordssize rows before mapping
+        # data — guards against attaching a truncated / partially written file
+        offset = self.hdr.dtype.itemsize
+        expected_bytes = offset + self.recdtype.itemsize * int(self.hdr['recordssize'])
+        actual_bytes = os.path.getsize(self.filepath)
+        if actual_bytes < expected_bytes:
+            raise Exception('attach %s: file too small (%d < %d), table not coherent'
+                            % (self.relpath, actual_bytes, expected_bytes))
+        # memory map data
+        self.shf_data = np.memmap(self.filepath, self.recdtype, 'r+', offset,
+                                  (int(self.hdr['recordssize']),))
+        self.records = SharedNumpy('DISK', self.shf_data)
+        self.records.table = self
     ############### FREE ###############
     def free(self, acquire=True):
         """

{shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/TableIndex.py RENAMED Viewed

@@ -112,7 +112,44 @@ class TableIndex:
         except Exception as e:
             errmsg = 'Failed to intialize index for %s!\n%s' % (self.table.relpath, str(e))
             self.initialized = False
-        finally:
+        finally:
+            if not self.initialized:
+                Logger.log.error(errmsg)
+                raise Exception(errmsg)
+    def attach(self):
+        """
+        Attach to the already-built index of a table that another process has
+        loaded (mutex isloaded==1) and may be actively writing.
+        Unlike initialize(), attach() NEVER mutates shared state: it does not
+        call malloc()/malloc_pkey() (whose resetfile path truncates+recreates
+        the index file, zeroes it with pkey[:]=-1 and flips isidxcreated),
+        never runs create_index(), and never writes isidxcreated/hasindex on
+        the shared header. It derives the index file paths and rememmap()s the
+        existing files; rememmap() raises (rather than clobbers) on a size
+        mismatch, which is the correct, safe behaviour against a live table.
+        """
+        errmsg = ''
+        try:
+            self.get_functions()
+            # derive the index file paths exactly as malloc_pkey /
+            # malloc_dateidx / malloc_symbolidx / malloc_portfolioidx do
+            self.pkeypath = str(self.table.filepath).replace('data.bin', 'pkey.bin')
+            if 'date' in self.pkeystr:
+                self.dateidxpath = str(self.table.filepath).replace('data.bin', 'dateidx.bin')
+            if ('symbol' in self.pkeystr) & (len(self.pkeycolumns) > 1):
+                self.symbolidxpath = str(self.table.filepath).replace('data.bin', 'symbolidx.bin')
+            if ('portfolio' in self.pkeystr) | ('tag' in self.pkeystr):
+                self.portidxpath = str(self.table.filepath).replace('data.bin', 'portidx.bin')
+            # map the existing index files read-only; rememmap() raises on a
+            # size mismatch instead of resetting/rebuilding the live index
+            self.rememmap()
+            self.initialized = True
+        except Exception as e:
+            self.initialized = False
+            errmsg = 'Failed to attach index for %s!\n%s' % (self.table.relpath, str(e))
+        finally:
             if not self.initialized:
                 Logger.log.error(errmsg)
                 raise Exception(errmsg)

{shareddata-6.83.8 → shareddata-6.83.12/src/shareddata.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: shareddata
-Version: 6.83.8
+Version: 6.83.12
 Summary: Memory Mapped / Shared Memory Database with S3 repository
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License