shareddata 6.83.8__tar.gz → 6.83.12__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (94)
  1. {shareddata-6.83.8/src/shareddata.egg-info → shareddata-6.83.12}/PKG-INFO +1 -1
  2. {shareddata-6.83.8 → shareddata-6.83.12}/setup.py +1 -1
  3. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/CollectionMongoDB.py +1 -1
  4. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Logger.py +58 -26
  5. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Routines/WorkerPool.py +22 -13
  6. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/SharedData.py +88 -26
  7. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Table.py +85 -19
  8. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/TableDisk.py +39 -2
  9. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/TableIndex.py +38 -1
  10. {shareddata-6.83.8 → shareddata-6.83.12/src/shareddata.egg-info}/PKG-INFO +1 -1
  11. {shareddata-6.83.8 → shareddata-6.83.12}/LICENSE +0 -0
  12. {shareddata-6.83.8 → shareddata-6.83.12}/MANIFEST.in +0 -0
  13. {shareddata-6.83.8 → shareddata-6.83.12}/README.md +0 -0
  14. {shareddata-6.83.8 → shareddata-6.83.12}/pyproject.toml +0 -0
  15. {shareddata-6.83.8 → shareddata-6.83.12}/setup.cfg +0 -0
  16. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/API/ServerGunicorn.py +0 -0
  17. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/API/ServerWaitress.py +0 -0
  18. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/API/__init__.py +0 -0
  19. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/API/auth.py +0 -0
  20. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/API/constants.py +0 -0
  21. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/API/routes/__init__.py +0 -0
  22. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/API/routes/cache.py +0 -0
  23. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/API/routes/collections.py +0 -0
  24. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/API/routes/metadata.py +0 -0
  25. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/API/routes/system.py +0 -0
  26. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/API/routes/tables.py +0 -0
  27. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/API/routes/timeseries.py +0 -0
  28. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/API/routes/workers.py +0 -0
  29. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/API/utils.py +0 -0
  30. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/CacheRedis.py +0 -0
  31. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Database.py +0 -0
  32. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Defaults.py +0 -0
  33. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/IO/AWSEC2.py +0 -0
  34. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/IO/AWSS3.py +0 -0
  35. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/IO/AutoDocstrings.py +0 -0
  36. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/IO/ClientAPI.py +0 -0
  37. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/IO/ClientSocket.py +0 -0
  38. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/IO/ClientWebSocket.py +0 -0
  39. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/IO/LogHandlerAPI.py +0 -0
  40. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/IO/MongoDBClient.py +0 -0
  41. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/IO/SaveTables.py +0 -0
  42. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/IO/ServerSocket.py +0 -0
  43. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/IO/ServerWebSocket.py +0 -0
  44. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/IO/StreamsCache.py +0 -0
  45. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/IO/StreamsPersist.py +0 -0
  46. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/IO/SyncTable.py +0 -0
  47. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/IO/TunnelWebSocket.py +0 -0
  48. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/IO/__init__.py +0 -0
  49. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Metadata.py +0 -0
  50. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/MultiProc.py +0 -0
  51. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/OpenFIGI.py +0 -0
  52. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Routines/BatchJob.py +0 -0
  53. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Routines/Schedule.py +0 -0
  54. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Routines/ScheduleMonitor.py +0 -0
  55. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Routines/Scheduler.py +0 -0
  56. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Routines/Worker.py +0 -0
  57. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Routines/WorkerLib.py +0 -0
  58. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Routines/__init__.py +0 -0
  59. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/SharedNumpy.py +0 -0
  60. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/StreamKafka.py +0 -0
  61. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Symbol.py +0 -0
  62. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/TableIndexJitFunctions.py +0 -0
  63. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/TableIndexJitFunctionsManual.py +0 -0
  64. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/TableIndexJitGenerate.py +0 -0
  65. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/TableIndexJitHash.py +0 -0
  66. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/TableIndexJitLoc.py +0 -0
  67. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/TimeSeriesDisk.py +0 -0
  68. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/TimeseriesContainer.py +0 -0
  69. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Users.py +0 -0
  70. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/Utils.py +0 -0
  71. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/__init__.py +0 -0
  72. {shareddata-6.83.8 → shareddata-6.83.12}/src/SharedData/sharedmutexwin.pyd +0 -0
  73. {shareddata-6.83.8 → shareddata-6.83.12}/src/shareddata.egg-info/SOURCES.txt +0 -0
  74. {shareddata-6.83.8 → shareddata-6.83.12}/src/shareddata.egg-info/dependency_links.txt +0 -0
  75. {shareddata-6.83.8 → shareddata-6.83.12}/src/shareddata.egg-info/requires.txt +0 -0
  76. {shareddata-6.83.8 → shareddata-6.83.12}/src/shareddata.egg-info/top_level.txt +0 -0
  77. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_api_collection.py +0 -0
  78. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_api_collection_loopback.py +0 -0
  79. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_api_table.py +0 -0
  80. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_api_table_schemaless.py +0 -0
  81. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_api_table_schemaless_extend.py +0 -0
  82. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_bson_last_pos_reuse.py +0 -0
  83. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_cache_redis.py +0 -0
  84. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_extend_rt.py +0 -0
  85. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_loc.py +0 -0
  86. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_metadata.py +0 -0
  87. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_read_write_tail.py +0 -0
  88. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_stream_loopback_async.py +0 -0
  89. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_timeseries.py +0 -0
  90. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_timeseries_api.py +0 -0
  91. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_upsert_unordered_get_date_loc_d1.py +0 -0
  92. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_upsert_unordered_get_date_loc_m1.py +0 -0
  93. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_upsert_unordered_get_date_loc_m15.py +0 -0
  94. {shareddata-6.83.8 → shareddata-6.83.12}/tests/test_upsert_unordered_positions_m1.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: shareddata
3
- Version: 6.83.8
3
+ Version: 6.83.12
4
4
  Summary: Memory Mapped / Shared Memory Database with S3 repository
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -34,7 +34,7 @@ install_requires = [
34
34
 
35
35
  setup(
36
36
  name='shareddata',
37
- version='6.83.8',
37
+ version='6.83.12',
38
38
  description='Memory Mapped / Shared Memory Database with S3 repository',
39
39
  long_description=open('README.md').read(),
40
40
  long_description_content_type='text/markdown',
@@ -112,7 +112,7 @@ class CollectionMongoDB:
112
112
  # Check for indexes other than the default _id index
113
113
  self.hasindex = any(index_name == pkey_name for index_name in index_info)
114
114
  # If index was requested but the collection was previously created without one, create it now
115
- if hasindex and not self.hasindex:
115
+ if hasindex and not self.hasindex and self.period in ['D1', 'M15', 'M1']:
116
116
  self.mongodb.client[self.user][self.relpath].create_index(pkey_fields, unique=True, name=pkey_name)
117
117
  self.mongodb.client[self.user][self.relpath].create_index([("mtime", DESCENDING)])
118
118
  self.hasindex = True
@@ -20,6 +20,36 @@ import json
20
20
  import requests
21
21
  import lz4
22
22
 
23
+
24
+ # Column layout of the semicolon-separated log files under DATABASE_FOLDER/Logs.
25
+ # 6 separators => 7 fields; the 7th (message) may itself contain ';'.
26
+ _LOG_COLUMNS = ['shardid', 'sequence_number', 'user_name',
27
+ 'asctime', 'logger_name', 'level', 'message']
28
+
29
+
30
+ def _parse_log_lines(text):
31
+ """Parse ';'-delimited log text into a DataFrame — stdlib only.
32
+
33
+ Uses str.split(';', 6) so any ';' inside the message field is preserved
34
+ verbatim (matching the prior `shape[1] > 7 -> merge cols 6..` behavior
35
+ that the pandas-based readLogs used to handle that case).
36
+
37
+ No pandas C parser is involved, so this cannot trip the pandas_parser
38
+ SIGSEGV that Logger.readLogs kept hitting at base_parser._infer_types.
39
+ Lines with fewer than 7 fields or empty asctime are dropped, matching
40
+ the previous on_bad_lines='skip' + dfnewlines[asctime].notna() filter.
41
+ """
42
+ rows = []
43
+ append = rows.append
44
+ for line in text.splitlines():
45
+ if not line:
46
+ continue
47
+ parts = line.split(';', 6)
48
+ if len(parts) < 7 or not parts[3]:
49
+ continue
50
+ append(parts)
51
+ return pd.DataFrame(rows, columns=_LOG_COLUMNS)
52
+
23
53
  from SharedData.IO.LogHandlerAPI import LogHandlerAPI
24
54
 
25
55
 
@@ -287,13 +317,11 @@ class Logger:
287
317
  ((pd.Timestamp.utcnow() + timedelta(days=-1)).strftime('%Y%m%d')+'.log')
288
318
  if lastlogfilepath.is_file():
289
319
  try:
290
- _dflogs = pd.read_csv(lastlogfilepath, header=None, sep=';',
291
- engine='python', on_bad_lines='skip')
292
- _dflogs.columns = ['shardid', 'sequence_number',
293
- 'user_name', 'asctime', 'logger_name', 'level', 'message']
320
+ with open(lastlogfilepath, 'r', encoding='utf-8', errors='replace') as _f:
321
+ _dflogs = _parse_log_lines(_f.read())
294
322
  Logger.dflogs = pd.concat([_dflogs, Logger.dflogs], axis=0, ignore_index=True)
295
323
  Logger.getLastLog(Logger.dflogs)
296
- Logger.getStatus(Logger.dflogs)
324
+ Logger.getStatus(Logger.dflogs)
297
325
  except Exception as e:
298
326
  print(f'Error reading last day logs: {e}')
299
327
 
@@ -355,24 +383,29 @@ class Logger:
355
383
  Logger.logfilepath = _logfilepath
356
384
 
357
385
  if Logger.logfilepath.is_file():
358
-
359
- with open(Logger.logfilepath, 'r') as file:
386
+
387
+ # Binary mode so _logfileposition is a plain byte offset with
388
+ # well-defined seek() semantics (text-mode tell() cookies cannot
389
+ # be manipulated arithmetically).
390
+ with open(Logger.logfilepath, 'rb') as file:
360
391
  file.seek(Logger._logfileposition)
361
- newlines = '\n'.join(file.readlines())
362
- if not newlines.strip(): # fix: prevent pd.read_csv crash on empty string
392
+ raw = file.read()
393
+ # Only consume up to the last complete line. A trailing
394
+ # partial (writer still mid-flush, or a kill -9 that left a
395
+ # half-line behind) is left in place so the next tick picks
396
+ # it up once '\n' is written. This is stricter than the old
397
+ # pandas path, which accepted the partial and advanced past
398
+ # it, silently losing the remainder bytes.
399
+ last_nl = raw.rfind(b'\n')
400
+ if last_nl < 0:
363
401
  return dfnewlines
364
- dfnewlines = pd.read_csv(StringIO(newlines), header=None, sep=';',
365
- engine='python', on_bad_lines='skip')
366
- if dfnewlines.shape[1] > 7:
367
- # Merge all columns from 6 onward into a single message
368
- message = dfnewlines.iloc[:, 6:].apply(lambda x: ';'.join(x.dropna().astype(str)), axis=1)
369
- dfnewlines = dfnewlines.iloc[:, :7]
370
- dfnewlines.iloc[:, 6] = message
371
-
372
- dfnewlines.columns = [
373
- 'shardid', 'sequence_number', 'user_name', 'asctime', 'logger_name', 'level', 'message'
374
- ]
375
- dfnewlines = dfnewlines[dfnewlines['asctime'].notna()]
402
+ complete_bytes = raw[:last_nl + 1]
403
+ text = complete_bytes.decode('utf-8', errors='replace')
404
+ Logger._logfileposition += len(complete_bytes)
405
+ # _parse_log_lines drops empty lines and rows whose asctime
406
+ # field is empty, folding the prior on_bad_lines='skip' +
407
+ # dfnewlines[asctime].notna() filter into the parse step.
408
+ dfnewlines = _parse_log_lines(text)
376
409
  if dfnewlines.empty:
377
410
  return dfnewlines
378
411
 
@@ -401,12 +434,11 @@ class Logger:
401
434
  Logger._max_asctime = max(
402
435
  max_asctime,
403
436
  dfnewlines_sorted['asctime'].iloc[-1]
404
- )
405
- Logger._logfileposition = file.tell()
406
-
407
-
437
+ )
438
+ # _logfileposition was already advanced above, right after we
439
+ # decided which prefix of the read to consume.
408
440
  return dfnewlines_sorted
409
-
441
+
410
442
  return dfnewlines
411
443
 
412
444
  @staticmethod
@@ -519,18 +519,27 @@ class WorkerPool:
519
519
  time.sleep(60)
520
520
 
521
521
  @staticmethod
522
- def get_active_jobs(shdata):
523
- # keep track of active jobs
522
+ def get_active_jobs(shdata, acquire=True):
523
+ # Schemaless reads must hold the table mutex: the writer updates
524
+ # bson bytes, bson_size and bson_ptr as separate non-atomic steps,
525
+ # and a lock-free reader can catch mid-update state and fail to
526
+ # decode BSON ("bad eoo that should never happen").
524
527
  job_table = WorkerPool.get_job_table(shdata)
525
- buff = np.full((1,),dtype=job_table.dtype,fill_value=np.nan)
526
- buff['hash'][0] = 'ACTIVE_JOBS'
527
- loc = job_table.get_loc(buff, acquire=False)
528
- if loc[0] != -1:
529
- active_jobs = job_table.get_dict_list(job_table[loc[0]],acquire=False)[0]
530
- else:
531
- active_jobs = {
532
- 'hash': 'ACTIVE_JOBS',
533
- 'status': 'ACTIVE',
534
- 'jobs': {},
535
- }
528
+ try:
529
+ if acquire:
530
+ job_table.acquire()
531
+ buff = np.full((1,),dtype=job_table.dtype,fill_value=np.nan)
532
+ buff['hash'][0] = 'ACTIVE_JOBS'
533
+ loc = job_table.get_loc(buff, acquire=False)
534
+ if loc[0] != -1:
535
+ active_jobs = job_table.get_dict_list(job_table[loc[0]],acquire=False)[0]
536
+ else:
537
+ active_jobs = {
538
+ 'hash': 'ACTIVE_JOBS',
539
+ 'status': 'ACTIVE',
540
+ 'jobs': {},
541
+ }
542
+ finally:
543
+ if acquire:
544
+ job_table.release()
536
545
  return active_jobs
@@ -541,14 +541,34 @@ class SharedData:
541
541
  names=names,formats=formats,size=100e3)
542
542
 
543
543
  @staticmethod
544
- def mutex(shm_name, pid):
544
+ def mutex(shm_name, pid):
545
545
  """
546
- Create or attach to a shared memory mutex object and register the process ID.
547
-
546
+ Allocate (or attach to) the per-table MUTEX shared-memory struct AND
547
+ ACQUIRE its `pid` lock. Returns once the lock is held by `pid`.
548
+
549
+ =====================================================================
550
+ IMPORTANT — this ACQUIRES the per-table mutex as a side effect.
551
+ =====================================================================
552
+ Called from `Table.init_schema()`; `Table.__init__` then keeps the
553
+ lock held for the remainder of `__init__`, releasing it only in the
554
+ `finally: self.release()` at the bottom. Every code path reachable
555
+ from `Table.__init__` — incl. `create_table()`, `load_table()`,
556
+ `read_header()`, `download()`, `TableDisk.malloc()`,
557
+ `index.initialize()`, `index.malloc_pkey()`, `create_index()` —
558
+ runs with this table's own mutex already held. Do NOT call
559
+ `acquire()` (or anything that internally re-acquires this same
560
+ mutex, e.g. `self.acquire()` on the same Table) from anywhere in
561
+ that call tree: the mutex is a non-reentrant CAS spinlock and
562
+ re-acquiring it self-deadlocks (the staleness check in `acquire()`
563
+ only force-acquires when the holder PID is *dead* — a self-
564
+ deadlocked caller is still alive).
565
+
566
+ Original docstring follows:
567
+
548
568
  This static method attempts to create a shared memory segment with a specific name
549
569
  appended by '#mutex' to serve as a mutex structure containing process ID, type, and load status.
550
570
  If the shared memory segment already exists, it attaches to it instead.
551
-
571
+
552
572
  It then acquires the mutex using the SharedData.acquire method, registers the calling process ID
553
573
  by appending it to a CSV file located in the DATABASE_FOLDER environment path, and returns
554
574
  the shared memory object, the mutex numpy structured array, and a flag indicating whether the
@@ -588,8 +608,43 @@ class SharedData:
588
608
  @staticmethod
589
609
  def acquire(mutex, pid, relpath):
590
610
  """
611
+ Spin-CAS acquire of a per-table MUTEX's `pid` field (1us sleep loop).
612
+ Atomically sets `mutex.pid = pid` once it observes `mutex.pid == 0`.
613
+ Staleness recovery: if the current holder PID is *dead*
614
+ (`psutil.pid_exists` is False), force-acquires by CASing
615
+ `holder_pid -> pid`.
616
+
617
+ =====================================================================
618
+ *** NON-REENTRANT ***
619
+ =====================================================================
620
+ This is a CAS-based spinlock with NO owner-recursion tracking. If the
621
+ current process already holds the mutex, calling `acquire()` again
622
+ SELF-DEADLOCKS: the CAS against 0 never succeeds (the field holds our
623
+ own pid, not 0), and the staleness check only force-acquires when the
624
+ holder is DEAD — which a self-deadlocked caller is not.
625
+
626
+ Note especially that `Table.__init__` holds this mutex throughout its
627
+ lifetime (acquired in `init_schema()` via `SharedData.mutex()`,
628
+ released only at `__init__`'s `finally`). Code reachable from
629
+ `__init__` — `load_table()`, `create_table()`, `read_header()`,
630
+ `TableDisk.malloc()`, `TableIndex.initialize()`, `create_index()`,
631
+ etc. — must NOT call `acquire()` on the table's own mutex.
632
+
633
+ =====================================================================
634
+ *** BRIEF-HOLD CONTRACT ***
635
+ =====================================================================
636
+ The mutex protects short shared-memory critical sections (one upsert /
637
+ one batch_extend / one get_loc / the `__init__` setup) and is shared
638
+ across Python AND C++ writers. Do NOT hold it across slow I/O — every
639
+ waiter spins at 1us granularity, burning CPU. On the singleton SCHEMA
640
+ table this is catastrophic: stalling SCHEMA stalls every process that
641
+ opens any table (because every Table open resolves its folder via
642
+ `schema.get_loc()`).
643
+
644
+ Original docstring follows:
645
+
591
646
  Attempt to acquire a process-safe mutex semaphore by atomically setting its value to the current process ID (pid).
592
-
647
+
593
648
  This method uses a compare-and-swap atomic operation to acquire the mutex. If the mutex is already held by another process, it waits in a loop, checking every microsecond. If the mutex remains held beyond certain time thresholds (1 second on the first check, 15 seconds thereafter), it verifies whether the locking process is still active. If the locking process has terminated, it forcibly acquires the mutex. The method logs a warning if waiting for the semaphore continues beyond the initial timeout.
594
649
 
595
650
  Parameters:
@@ -601,12 +656,27 @@ class SharedData:
601
656
  None
602
657
  """
603
658
  tini = time.time()
659
+ tstart = tini # total-wait timer; unlike tini it is never reset below
660
+ # The SCHEMA table is the singleton every process depends on (every
661
+ # Table open resolves its folder via schema.get_loc()). The staleness
662
+ # check below only recovers a DEAD holder — an alive-but-stuck holder
663
+ # (e.g. a re-entrant acquire) would otherwise hang every dependent
664
+ # process forever. So for the SCHEMA table only, abort with an error
665
+ # after 30s instead of spinning indefinitely.
666
+ schema_timeout = 30.0 if '/SCHEMA/table/' in relpath else None
604
667
  # semaphore is process safe
605
668
  telapsed = 0
606
669
  hdrptr = mutex.__array_interface__['data'][0]
607
670
  semseek = 0
608
671
  firstcheck = True
609
672
  while cpp.long_compare_and_swap(hdrptr, semseek, 0, pid) == 0:
673
+ if (schema_timeout is not None) and (time.time() - tstart > schema_timeout):
674
+ lockingpid = int(mutex['pid'])
675
+ errmsg = ('%s SCHEMA semaphore still held by pid %d after %.0fs — '
676
+ 'aborting acquire to avoid hanging every dependent process'
677
+ % (relpath, lockingpid, schema_timeout))
678
+ Logger.log.error(errmsg)
679
+ raise TimeoutError(errmsg)
610
680
  # check if process that locked the mutex is still running
611
681
  telapsed = time.time() - tini
612
682
  if (telapsed > 15) | ((firstcheck) & (telapsed > 1)):
@@ -771,15 +841,11 @@ class SharedData:
771
841
 
772
842
  dfcollections = self.list_collections(keyword,user=user)
773
843
 
774
- ls = dfremote.copy()
775
- # merge local
776
- ls = ls.reindex(index=ls.index.union(dflocal.index),
777
- columns=ls.columns.union(dflocal.columns))
778
- ls.loc[dflocal.index,dflocal.columns] = dflocal.values
779
- # merge collections
780
- ls = ls.reindex(index=ls.index.union(dfcollections.index),
781
- columns=ls.columns.union(dfcollections.columns))
782
- ls.loc[dfcollections.index,dfcollections.columns] = dfcollections.values
844
+ # merge local then collections; combine_first preserves dtypes (avoids
845
+ # FutureWarning from assigning object/datetime values into reindexed
846
+ # float64 NaN columns).
847
+ ls = dflocal.combine_first(dfremote)
848
+ ls = dfcollections.combine_first(ls)
783
849
 
784
850
  if len(ls)>0:
785
851
  ls = ls.reindex(columns=['folder_local', 'created_local', 'last_modified_local','size_local','files_local',
@@ -990,12 +1056,10 @@ class SharedData:
990
1056
  dflocal = self.list_local(keyword,user=user)
991
1057
  Logger.log.info(f'list_local took {time.time()-ti:.2f} seconds')
992
1058
 
993
- ls = dfremote.copy()
994
- # merge local
995
- ls = ls.reindex(index=ls.index.union(dflocal.index),
996
- columns=ls.columns.union(dflocal.columns))
997
- ls.loc[dflocal.index,dflocal.columns] = dflocal.values
998
-
1059
+ # combine_first preserves dtypes (avoids FutureWarning from assigning
1060
+ # object/datetime values into reindexed float64 NaN columns).
1061
+ ls = dflocal.combine_first(dfremote)
1062
+
999
1063
  if len(ls)>0:
1000
1064
  ls = ls.reindex(columns=['folder_local', 'created_local', 'last_modified_local','size_local','files_local',
1001
1065
  'folder_remote', 'last_modified_remote', 'size_remote', 'files_remote','storage_class_remote',
@@ -1049,12 +1113,10 @@ class SharedData:
1049
1113
  dfremote = self.list_remote(keyword, user=user)
1050
1114
  dflocal = self.list_local(keyword, user=user)
1051
1115
 
1052
- ls = dfremote.copy()
1053
- # merge local
1054
- ls = ls.reindex(index=ls.index.union(dflocal.index),
1055
- columns=ls.columns.union(dflocal.columns))
1056
- ls.loc[dflocal.index, dflocal.columns] = dflocal.values
1057
-
1116
+ # combine_first preserves dtypes (avoids FutureWarning from assigning
1117
+ # object/datetime values into reindexed float64 NaN columns).
1118
+ ls = dflocal.combine_first(dfremote)
1119
+
1058
1120
  if len(ls) > 0:
1059
1121
  ls = ls.reindex(columns=['folder_local', 'created_local', 'last_modified_local','size_local','files_local',
1060
1122
  'folder_remote', 'last_modified_remote', 'size_remote', 'files_remote','storage_class_remote',
@@ -146,10 +146,20 @@ class Table:
146
146
  self.index = TableIndex(self)
147
147
 
148
148
  errmsg = ''
149
- try:
150
-
149
+ try:
150
+ # init_schema() ACQUIRES this table's per-table mutex (via
151
+ # SharedData.mutex() -> SharedData.acquire()) and the lock is
152
+ # held until `finally: self.release()` at the bottom of this
153
+ # try/except. Every method called below — create_table(),
154
+ # load_table(), read_header(), TableDisk.malloc(),
155
+ # index.initialize(), index.create_index() — therefore runs
156
+ # WITH the mutex already held. Do NOT call self.acquire() (or
157
+ # anything that re-acquires this same mutex) from anywhere
158
+ # reachable from here: the mutex is a non-reentrant CAS
159
+ # spinlock and re-acquiring it self-deadlocks. See the
160
+ # SharedData.mutex / SharedData.acquire docstrings.
151
161
  self.init_schema()
152
-
162
+
153
163
  if (not self.exists) | (self.overwrite):
154
164
 
155
165
  if not records is None:
@@ -190,8 +200,18 @@ class Table:
190
200
 
191
201
  def init_schema(self):
192
202
  """
193
- Initialize the schema and related attributes for the table instance.
194
-
203
+ Initialize the schema and related attributes for the table instance,
204
+ AND ACQUIRE this table's per-table mutex (via `SharedData.mutex()` ->
205
+ `SharedData.acquire()`). The mutex is held from here until
206
+ `Table.__init__`'s `finally: self.release()` at the bottom — so
207
+ every method called from `__init__` (create_table, load_table,
208
+ read_header, malloc, index.initialize, ...) runs with the mutex
209
+ already held. It is non-reentrant; do not re-acquire it from
210
+ anywhere reachable from `__init__`. See `SharedData.mutex` /
211
+ `SharedData.acquire` docstrings for the full contract.
212
+
213
+ Original docstring follows:
214
+
195
215
  This method sets up header and tail header formats and names, initializes mutexes for shared memory access,
196
216
  determines the storage path based on user, database, period, source, and tablename, and manages schema information
197
217
  from shareddata. It handles synchronization and consistency checks for the table's loading type, updates or inserts
@@ -412,6 +432,14 @@ class Table:
412
432
  def create_table(self):
413
433
  # create new table or overwrite existing table
414
434
  """
435
+ Called from `Table.__init__` for a new or overwrite-requested table.
436
+ Runs WITH this table's per-table mutex already held — `init_schema()`
437
+ acquired it before this is called, and it is released only in
438
+ `__init__`'s `finally`. Therefore TableDisk.malloc(create=True)'s
439
+ `shf_hdr[0] = self.hdr` write below is serialized against C++ writers
440
+ (which acquire the same per-table mutex for `batch_extend`/`upsert`)
441
+ — it is NOT a lock-free clobber. Do NOT acquire the mutex again here.
442
+
415
443
  Create a new table or overwrite an existing one with the provided records.
416
444
 
417
445
  This method performs the following steps:
@@ -733,6 +761,19 @@ class Table:
733
761
  def load_table(self):
734
762
  # open existing table
735
763
  """
764
+ Called from `Table.__init__` for an existing table. Runs WITH this
765
+ table's per-table mutex already held — `init_schema()` acquired it
766
+ before this is called, and it is released only in `__init__`'s
767
+ `finally`. Therefore `read_header()` reads a quiescent header (no
768
+ C++ writer can mutate it concurrently — they take the same mutex
769
+ for `batch_extend`/`upsert`), and `TableDisk.malloc(create=True)`'s
770
+ `shf_hdr[0] = self.hdr` writes back the SAME bytes under the SAME
771
+ lock — it is idempotent, NOT a clobber. Do NOT acquire the mutex
772
+ again here: it is non-reentrant and re-acquire self-deadlocks. See
773
+ `SharedData.mutex` / `SharedData.acquire` docstrings.
774
+
775
+ Original docstring follows:
776
+
736
777
  Load data into the table, ensuring the local file is available and up to date.
737
778
 
738
779
  If a local table file exists, it reads the header from the file. If the table is not loaded or the local file does not exist, it downloads the table data. Memory allocation for the table is then performed.
@@ -746,23 +787,48 @@ class Table:
746
787
  with open(self.filepath, 'rb') as io_obj:
747
788
  self.read_header(io_obj)
748
789
 
749
- if ((self.mutex['isloaded']==0) | (not self.exists_local)):
750
- self.download()
751
-
752
- self.malloc(create=True)
790
+ # NOTE: this attach/cold-init split is an OPTIMIZATION ONLY — both
791
+ # paths produce equivalent state and both are concurrency-safe.
792
+ # `Table.__init__` holds this table's per-table mutex throughout
793
+ # (acquired in `init_schema()`, released in `__init__`'s `finally`),
794
+ # and C++ writers acquire the same mutex for `batch_extend`/`upsert`,
795
+ # so any concurrent C++ writer is blocked while we are in here.
796
+ # Therefore the historical `malloc(create=True)`'s
797
+ # `shf_hdr[0] = self.hdr` is idempotent — it writes back exactly the
798
+ # bytes `read_header()` just read under the same lock — and is NOT a
799
+ # live-writer clobber. The ATTACH path simply skips that idempotent
800
+ # rewrite (and the no-op `index.initialize()` on a healthy live
801
+ # table). Earlier comments here claimed `malloc(create=True)` would
802
+ # clobber a live writer; that premise was wrong (the mutex
803
+ # serializes them). Do NOT call `self.acquire()` anywhere below —
804
+ # the mutex is non-reentrant and re-acquire self-deadlocks. See
805
+ # `SharedData.mutex` / `SharedData.acquire` docstrings.
806
+ attach_only = (self.exists_local
807
+ and (self.mutex['isloaded'] == 1)
808
+ and (not self.header_changed))
809
+ if attach_only:
810
+ # ATTACH: skip the idempotent header rewrite (and index
811
+ # re-initialize) for healthy already-loaded tables.
812
+ self.attach()
813
+ if self.hasindex:
814
+ self.index.attach()
815
+ else:
816
+ if ((self.mutex['isloaded']==0) | (not self.exists_local)):
817
+ self.download()
818
+ self.malloc(create=True)
753
819
 
754
- if self.hasindex:
755
- self.index.initialize()
756
- if self.records.count>0:
757
- # check if index is coherent
758
- loc = self.records.get_loc(self.records[0:1], acquire=False)
759
- if loc[0]!=0:
760
- # index is not coherent
761
- self.hdr['isidxcreated']=0
762
- self.index.initialize()
820
+ if self.hasindex:
821
+ self.index.initialize()
822
+ if self.records.count>0:
823
+ # check if index is coherent
763
824
  loc = self.records.get_loc(self.records[0:1], acquire=False)
764
825
  if loc[0]!=0:
765
- raise Exception('Cannot create index!')
826
+ # index is not coherent
827
+ self.hdr['isidxcreated']=0
828
+ self.index.initialize()
829
+ loc = self.records.get_loc(self.records[0:1], acquire=False)
830
+ if loc[0]!=0:
831
+ raise Exception('Cannot create index!')
766
832
 
767
833
  def read_header(self, io_obj):
768
834
  """
@@ -109,8 +109,45 @@ class TableDisk(Table):
109
109
  offset = self.hdr.dtype.itemsize
110
110
  self.shf_data = np.memmap(self.filepath,self.recdtype,'r+',offset,(self.hdr['recordssize'],))
111
111
  self.records = SharedNumpy('DISK', self.shf_data)
112
- self.records.table = self
113
-
112
+ self.records.table = self
113
+
114
+ ############### ATTACH ###############
115
def attach(self):
    """
    Memory-map an existing table file without writing its shared header.

    The header is mapped in place and only read here: this method performs
    no `self.shf_hdr[0] = ...` assignment and no store into
    `hdr['recordssize']`. `recordssize` is read from the mapped header
    exactly once, and that single value is used for `self.size`, for the
    file-size guard and for the shape of the data mapping, so the three
    can never disagree.

    Sets `self.shf_hdr`, `self.hdr`, `self.size`, `self.shf_data` and
    `self.records`.

    Raises:
        Exception: if the file is smaller than the header plus
            `recordssize` records (truncated / partially written file).

    NOTE(review): the load-path comment in Table describes the header
    rewrite done by malloc(create=True) as idempotent under the table
    mutex, so attach() is a way to skip that rewrite rather than a fix for
    an unsynchronised clobber — confirm against SharedData.mutex.
    """
    # map the header in place; mode 'r+' keeps the mapping writable, but
    # attach() itself never stores into it
    self.shf_hdr = np.memmap(self.filepath, self.hdrdtype, 'r+', 0, (1,))
    self.hdr = self.shf_hdr[0]

    # one snapshot of the record capacity, taken from the mapped header
    # rather than from any previously read copy
    nrecords = int(self.hdr['recordssize'])
    self.size = nrecords

    # the file must back header + nrecords rows before the data is mapped;
    # this turns a truncated file into an explicit, descriptive error
    offset = self.hdr.dtype.itemsize
    expected_bytes = offset + self.recdtype.itemsize * nrecords
    actual_bytes = os.path.getsize(self.filepath)
    if actual_bytes < expected_bytes:
        raise Exception('attach %s: file too small (%d < %d), table not coherent'
                        % (self.relpath, actual_bytes, expected_bytes))

    # map the records that follow the header
    self.shf_data = np.memmap(self.filepath, self.recdtype, 'r+', offset,
                              (nrecords,))
    self.records = SharedNumpy('DISK', self.shf_data)
    self.records.table = self
150
+
114
151
  ############### FREE ###############
115
152
  def free(self, acquire=True):
116
153
  """
@@ -112,7 +112,44 @@ class TableIndex:
112
112
  except Exception as e:
113
113
  errmsg = 'Failed to intialize index for %s!\n%s' % (self.table.relpath, str(e))
114
114
  self.initialized = False
115
- finally:
115
+ finally:
116
+ if not self.initialized:
117
+ Logger.log.error(errmsg)
118
+ raise Exception(errmsg)
119
+
120
def attach(self):
    """
    Attach to the index files of an already-loaded table.

    Calls `get_functions()`, derives the index file paths from the table's
    data file path (replacing 'data.bin' with the per-index file name) and
    then calls `rememmap()`. This method does not itself call
    `initialize()`, `malloc()`, `malloc_pkey()` or `create_index()`, and it
    does not assign to any header field.

    On success `self.initialized` is True. It is set to False on entry, so
    any exit by exception leaves it False.

    Raises:
        Exception: if any step fails; the message is
            'Failed to attach index for <relpath>!' followed by the
            original error text, and the original exception is chained as
            the cause. The failure is logged before it is raised.

    NOTE(review): the original comments state that rememmap() maps the
    existing files and raises on a size mismatch instead of rebuilding —
    rememmap() is not visible here, confirm.
    """
    self.initialized = False
    try:
        self.get_functions()
        # same path derivation as the malloc_* helpers are said to use
        datapath = str(self.table.filepath)
        self.pkeypath = datapath.replace('data.bin', 'pkey.bin')
        if 'date' in self.pkeystr:
            self.dateidxpath = datapath.replace('data.bin', 'dateidx.bin')
        if ('symbol' in self.pkeystr) and (len(self.pkeycolumns) > 1):
            self.symbolidxpath = datapath.replace('data.bin', 'symbolidx.bin')
        if ('portfolio' in self.pkeystr) or ('tag' in self.pkeystr):
            self.portidxpath = datapath.replace('data.bin', 'portidx.bin')
        self.rememmap()
    except Exception as e:
        # report inside the handler, not in a `finally`: a raise in
        # `finally` would also replace KeyboardInterrupt/SystemExit with an
        # empty-message Exception
        errmsg = 'Failed to attach index for %s!\n%s' % (self.table.relpath, str(e))
        Logger.log.error(errmsg)
        raise Exception(errmsg) from e
    self.initialized = True
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: shareddata
3
- Version: 6.83.8
3
+ Version: 6.83.12
4
4
  Summary: Memory Mapped / Shared Memory Database with S3 repository
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: License :: OSI Approved :: MIT License
File without changes
File without changes
File without changes
File without changes
File without changes