PyPI - pydcache - Versions diffs - 0.1.6__py3-none-any.whl - Mend

pydcache 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

pydcache/__init__.py +3 -0
pydcache/config/__init__.py +79 -0
pydcache/config/cta-nanny.service +32 -0
pydcache/config/pydcache.yaml +29 -0
pydcache/py.typed +0 -0
pydcache/scripts/__init__.py +0 -0
pydcache/scripts/cta_nanny.py +383 -0
pydcache/scripts/install_service.py +97 -0
pydcache/util/__init__.py +3 -0
pydcache/util/admin.py +174 -0
pydcache/util/kerberos.py +54 -0
pydcache/util/ostools.py +51 -0
pydcache/util/paramiko.py +40 -0
pydcache/util/psycopg.py +165 -0
pydcache-0.1.6.dist-info/METADATA +47 -0
pydcache-0.1.6.dist-info/RECORD +20 -0
pydcache-0.1.6.dist-info/WHEEL +5 -0
pydcache-0.1.6.dist-info/entry_points.txt +2 -0
pydcache-0.1.6.dist-info/licenses/LICENSE +661 -0
pydcache-0.1.6.dist-info/top_level.txt +1 -0

pydcache/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""pydcache – collection of dCache Python utilities."""
+__version__ = "0.1.6"

pydcache/config/__init__.py ADDED Viewed

@@ -0,0 +1,79 @@
+"""pydcache.config – configuration file deployment helpers."""
+from __future__ import annotations
+import logging
+import os
+import shutil
+import stat
+from pathlib import Path
+logger = logging.getLogger(__name__)
+# Bundled example / template config shipped inside the package
+_TEMPLATE = Path(__file__).parent / "pydcache.yaml"
+# Environment variable that overrides automatic path resolution
+_ENV_VAR = "DCACHE_CONFIG"
+def _default_config_path() -> Path:
+    """Return the platform-appropriate config path for the current user.
+    * root (uid 0)  → /etc/pydcache/pydcache.yaml
+    * regular user  → $XDG_CONFIG_HOME/pydcache/pydcache.yaml, falling back to
+      ~/.config/pydcache/pydcache.yaml when XDG_CONFIG_HOME is unset or empty
+      (XDG Base Dir spec)
+    Returns:
+        Path of the default config file; nothing is created here.
+    """
+    if os.getuid() == 0:
+        return Path("/etc/pydcache/pydcache.yaml")
+    # An empty XDG_CONFIG_HOME must behave like an unset one; otherwise
+    # Path("") would make the result relative to the current directory.
+    xdg_config = os.environ.get("XDG_CONFIG_HOME") or Path.home() / ".config"
+    return Path(xdg_config) / "pydcache" / "pydcache.yaml"
+def get_config_path() -> Path:
+    """Resolve the config file location, deploying the template when absent.
+    Lookup order:
+    1. ``$DCACHE_CONFIG`` – used as given; it is never auto-deployed and must
+       already exist.
+    2. The platform default (``/etc/pydcache/`` or ``~/.config/pydcache/``);
+       when nothing exists there yet, the bundled template is copied into
+       place with mode **0600**.
+    Returns:
+        Path to the config file to load.
+    Raises:
+        FileNotFoundError: If ``$DCACHE_CONFIG`` is set but does not exist.
+    """
+    override = os.environ.get(_ENV_VAR)
+    if not override:
+        # No explicit override: fall back to the per-user/system default and
+        # seed it from the template on first use.
+        default_path = _default_config_path()
+        if not default_path.exists():
+            _deploy_template(default_path)
+        return default_path
+    override_path = Path(override)
+    if override_path.exists():
+        return override_path
+    raise FileNotFoundError(
+        f"{_ENV_VAR}={override_path} — file does not exist"
+    )
+def _deploy_template(dest: Path) -> None:
+    """Write the bundled template to *dest* with permissions 0600.
+    The destination is opened with mode 0600 from the start instead of being
+    copied and chmod-ed afterwards, so it is never visible with the
+    template's wider permissions, not even briefly.
+    Args:
+        dest: Destination path (parent directories are created as needed).
+    Raises:
+        OSError: If the directory cannot be created or the file written.
+    """
+    mode = stat.S_IRUSR | stat.S_IWUSR  # 0600
+    # Read first so an unreadable template does not leave an empty file behind.
+    template = _TEMPLATE.read_bytes()
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    fd = os.open(dest, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, mode)
+    with os.fdopen(fd, "wb") as handle:
+        # The mode given to os.open() is filtered by the umask and ignored for
+        # a pre-existing file, so enforce 0600 explicitly before writing.
+        os.fchmod(handle.fileno(), mode)
+        handle.write(template)
+    logger.info(
+        "Deployed example config to %s — please fill in your credentials.",
+        dest,
+    )

pydcache/config/cta-nanny.service ADDED Viewed

@@ -0,0 +1,32 @@
+[Unit]
+Description=CTA Nanny – dCache/CTA duplicate-key repair daemon
+Documentation=https://github.com/DmitryLitvintsev/pydcache
+After=network-online.target
+Wants=network-online.target
+[Service]
+Type=simple
+User=root
+Group=root
+# Path to your filled-in config (start from the bundled pydcache.yaml template and edit). The file must already exist: it is not auto-deployed when DCACHE_CONFIG is set.
+Environment="DCACHE_CONFIG=/etc/pydcache/pydcache.yaml"
+# Adjust --cpu-count and --instance to match your site
+ExecStart=CTA_NANNY_EXEC --cpu-count 10 --instance public_prd
+# Restart policy
+Restart=on-failure
+RestartSec=30s
+# Resource limits
+LimitNOFILE=65536
+# Send stdout/stderr to the journal
+StandardOutput=journal
+StandardError=journal
+SyslogIdentifier=cta-nanny
+[Install]
+WantedBy=multi-user.target

pydcache/config/pydcache.yaml ADDED Viewed

@@ -0,0 +1,29 @@
+# cta_nanny configuration example
+#
+# Copy this file to a secure location, fill in your real values,
+# and point cta-nanny at it via the DCACHE_CONFIG environment variable:
+#
+#   export DCACHE_CONFIG=/path/to/pydcache.yaml
+#
+# WARNING: this file contains credentials — do NOT commit it to version control.
+# PostgreSQL connection URIs (require read/write access)
+# Format: postgresql://<user>:<password>@<host>:<port>/<database>
+cta_db: "postgresql://cta_user:changeme@cta-db.example.com:5432/cta_prd"
+chimera_db: "postgresql://chimera_user:changeme@chimera-db.example.com:5432/chimera"
+# dCache admin SSH shell
+admin:
+  host: dcache-admin.example.com
+  port: 22224
+  user: dcache_admin_user
+# Kafka consumer
+kafka:
+  # Topic produced by cta-taped ingest logs
+  topic: "ingest.logs.cta.taped"
+  # topic: "ingest.dcache.billing"
+  group: "dcache-ctananny-consumer-group"
+  # group: "dcache-public-stores"
+  bootstrap_servers: "kafka-broker.example.com:9092"

pydcache/py.typed ADDED Viewed

File without changes

pydcache/scripts/__init__.py ADDED Viewed

File without changes

pydcache/scripts/cta_nanny.py ADDED Viewed

@@ -0,0 +1,383 @@
+import argparse
+import json
+import logging
+import logging.config
+import multiprocessing
+import os
+import re
+import socket
+import sys
+import time
+import traceback
+from multiprocessing import Lock, Process, Queue
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, NoReturn
+from urllib.parse import urlparse
+import paramiko
+import psycopg2
+import psycopg2.extensions
+import psycopg2.extras
+import yaml
+from pydcache.util import admin, kerberos, ostools, paramiko, psycopg
+from pydcache.config import get_config_path
+from kafka import KafkaConsumer
+"""
+epoch_time\":1775496237.108132665,\"local_time\":\"2026-04-06T12:23:57-0500\",\"hostname\":\"tpsrvf2104\",\"program\":\"cta-taped\",\"log_level\":\"ERROR\",\"pid\":1160350,\"tid\":1638518,\"message\":\"In ArchiveMount::reportJobsBatchTransferred(): got an exception\",\"drive_name\":\"F1_F9B5D4\",\"instance\":\"prd\",\"sched_backend\":\"cephUser\",\"thread\":\"MainThread\",\"tapeDrive\":\"F1_F9B5D4\",\"mountId\":\"453120\",\"vo\":\"cms\",\"tapePool\":\"cms.Run2025DPrompt\",\"successfulBatchSize\":7,\"exceptionMessageValue\":\"commit problem committing the DB transaction: Database library reported: ERROR:  duplicate key value violates unique constraint \"archive_file_din_dfi_un\"DETAIL:  Key (disk_instance_name, disk_file_id)=(cms_prd, 00003DE869D8B37F4516A7FF64556A8A7E01) already exists. (DB Result Status:7 SQLState:23505)\"}
+{'date': '2026-04-13T13:04:20.794-05:00', 'msgType': 'store', 'hsm': {'instance': 'cta', 'provider': 'dcache-cta', 'type': 'cta'}, 'transferTime': 9, 'cellName': 'rw-stkendca61a-2', 'session': 'pool:rw-stkendca61a-2@rw-stkendca61a-2Domain:1776103460794-37454', 'version': '1.0', 'storageInfo': 'dune.dune@cta', 'cellType': 'pool', 'fileSize': 2967538045, 'queuingTime': 0, 'cellDomain': 'rw-stkendca61a-2Domain', 'locations': [], 'pnfsid': '0000082C1CA24ABE411FA957DCB563929441', 'transaction': 'pool:rw-stkendca61a-2@rw-stkendca61a-2Domain:1776103460794-37454', 'billingPath': '/pnfs/fnal.gov/usr/dune/tape_backed/dunepro/beam-data/040416/foo.root', 'status': {'msg': 'io.grpc.StatusRuntimeException: ABORTED: Storage class dune.dune@cta has no archive routes', 'code': 10011}}
+"""
+# Configure logging: INFO and above go to stderr. This runs at import time.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    stream=sys.stderr
+)
+logger = logging.getLogger(__name__)
+# Global locks
+# print_lock is held around log calls made from the Worker processes.
+print_lock = Lock()
+# NOTE(review): kinit_lock is not referenced anywhere else in this file — confirm it is still needed.
+kinit_lock = Lock()
+# NOTE(review): CONFIG_FILE is not read again in this file; main() resolves the path via get_config_path().
+CONFIG_FILE = os.getenv("DCACHE_CONFIG")   # None → auto-resolved via get_config_path()
+# Fallbacks used by Worker.run() when the config's "admin" section omits host/port/user.
+SSH_HOST = "fndca"
+SSH_PORT = 24223
+SSH_USER = "enstore"
+# NOTE(review): HOSTNAME is not referenced anywhere else in this file.
+HOSTNAME = socket.getfqdn()
+# Keys that main() requires in the "cta" payload of each Kafka message.
+_VO = "vo"
+_INSTANCE = "instance"
+def safe_json_deserializer(v):
+    """Decode a raw Kafka message value into a Python object.
+    Args:
+        v: Raw message bytes, or None/empty for a message without a value.
+    Returns:
+        The parsed JSON document, or None when *v* is empty or is not valid
+        UTF-8 encoded JSON.
+    """
+    if not v:
+        return None
+    try:
+        return json.loads(v.decode('utf-8'))
+    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
+        # Best effort: a malformed message must not stop the consumer, but it
+        # should leave a trace instead of vanishing silently.
+        logger.warning("Discarding undecodable Kafka message: %s", exc)
+        return None
+class Worker(Process):
+    """Worker process to query CTA DB and process results.
+    Each worker opens its own CTA and Chimera database connections and a
+    dCache admin shell, then handles pnfsids taken from the shared queue
+    until it receives the ``None`` sentinel.
+    """
+    def __init__(
+        self,
+        queue: Queue,
+        config: Dict[str, Any]
+    ) -> None:
+        """Initialize the worker.
+        Args:
+            queue: Task queue
+            config: Configuration dictionary
+        """
+        super().__init__()
+        self.queue = queue
+        self.config = config
+    def run(self) -> None:
+        """Process tasks from the queue until the ``None`` sentinel arrives."""
+        cta_db = chimera_db = ssh = None
+        try:
+            cta_db = psycopg.create_connection(self.config["cta_db"])
+            chimera_db = psycopg.create_connection(self.config["chimera_db"])
+            # Every admin field has a built-in default, so tolerate a config
+            # without an "admin" section instead of failing with KeyError.
+            admin_config = self.config.get("admin") or {}
+            ssh = paramiko.get_shell(
+                admin_config.get("host", SSH_HOST),
+                admin_config.get("port", SSH_PORT),
+                admin_config.get("user", SSH_USER)
+            )
+            for pnfsid in iter(self.queue.get, None):
+                self._process_file(ssh, cta_db, chimera_db, pnfsid)
+        except Exception:
+            # NOTE(review): any exception ends this worker for good; once all
+            # workers are gone the bounded queue in main() can fill up.
+            error_string = traceback.format_exc()
+            logger.error(f"Worker failed: {error_string}")
+        finally:
+            # Best-effort cleanup: a failing close() must not mask the
+            # original error or prevent the remaining handles from closing.
+            for conn in (ssh, cta_db, chimera_db):
+                if conn:
+                    try:
+                        conn.close()
+                    except Exception:
+                        pass
+    def _process_file(
+        self,
+        ssh,
+        cta_db: psycopg2.extensions.connection,
+        chimera_db: psycopg2.extensions.connection,
+        pnfsid: str
+    ) -> None:
+        """Process a single file.
+        Looks the file up in the CTA ``archive_file`` table (only rows created
+        more than six hours ago qualify). If Chimera holds no ``itype = 0``
+        location for it yet, the storage info and CTA location are inserted
+        into Chimera and two admin commands (``rep set cached`` and
+        ``st kill``) are issued for the pnfsid.
+        Args:
+            ssh: SSH connection
+            cta_db: CTA database connection
+            chimera_db: Chimera database connection
+            pnfsid: PNFS ID to process
+        """
+        rows = psycopg.select(
+            cta_db,
+            "select af.disk_instance_name, "
+            "sc.storage_class_name, "
+            "'cta://cta/'||af.disk_file_id||'?archiveid='||af.archive_file_id as location "
+            "from archive_file af inner join storage_class sc on sc.storage_class_id = af.storage_class_id "
+            "where af.disk_file_id = %s and af.creation_time < %s",
+            (pnfsid, int(time.time()) - 6 * 3600)
+        )
+        if not rows:
+            with print_lock:
+                logger.error(f"Failed to find CTA location for {pnfsid}")
+            return
+        disk_instance_name = rows[0]["disk_instance_name"]
+        location = rows[0]["location"]
+        storage_class = rows[0]["storage_class_name"]
+        # Expected shape is "<storage_group>.<file_family>@<hsm>". Skip the
+        # file on anything else rather than letting a ValueError from the
+        # unpacking terminate the whole worker.
+        group_and_family = storage_class.split("@")[0].split(".")
+        if len(group_and_family) != 2:
+            with print_lock:
+                logger.error(
+                    f"Unexpected storage class {storage_class!r} for {pnfsid}, Skipping"
+                )
+            return
+        storage_group, file_family = group_and_family
+        # NOTE(review): this progress message is logged at ERROR level — confirm that is intended.
+        with print_lock:
+            logger.error(f"Processing {pnfsid},  {disk_instance_name}, {storage_class}")
+        result = psycopg.select(
+            chimera_db,
+            "select count(*) from t_locationinfo "
+            "where itype = 0 and inumber = "
+            "(select inumber from t_inodes where ipnfsid = %s)",
+            (pnfsid,)
+        )
+        if result[0]["count"] != 0:
+            with print_lock:
+                logger.error(
+                    f"File has location in chimera {pnfsid} {storage_class}, Skipping"
+                    )
+            return
+        psycopg.insert(
+            chimera_db,
+            "insert into t_storageinfo "
+            "(inumber, ihsmname, istoragegroup, istoragesubgroup) "
+            "values (pnfsid2inumber(%s), 'cta', %s, %s)",
+            (pnfsid, storage_group, file_family)
+        )
+        psycopg.insert(
+            chimera_db,
+            "insert into t_locationinfo "
+            "(inumber, itype, ipriority, ictime, iatime, istate, ilocation) "
+            "values (pnfsid2inumber(%s), 0, 10, now(), now(), 1, %s)",
+            (pnfsid, location)
+        )
+        admin.execute_admin_command(
+            ssh,
+            f"\\sl {pnfsid} rep set cached {pnfsid}"
+        )
+        admin.execute_admin_command(
+            ssh,
+            f"\\sl {pnfsid} st kill {pnfsid}"
+        )
+def main() -> None:
+    """Consume CTA log messages from Kafka and queue duplicate-key files for repair.
+    Loads the YAML configuration, starts the Kerberos refresh worker and
+    ``--cpu-count`` ``Worker`` processes, then reads the configured Kafka
+    topic. For every message whose ``cta`` payload reports a
+    "duplicate key value violates unique constraint" error for the disk
+    instance given by ``--instance``, the affected pnfsid is put on the
+    worker queue. The loop ends on Ctrl-C or, checked whenever a message
+    arrives, when ``/tmp/STOP`` exists. Exits with status 1 when the config
+    file is missing, unreadable, unparsable, empty, or accessible to group
+    or others.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description="Process files that are already on tape and update their status"
+    )
+    parser.add_argument(
+        "--cpu-count",
+        type=int,
+        default=10,
+        help="Number of worker processes to spawn"
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Enable verbose logging"
+    )
+    parser.add_argument(
+        "-i", "--instance",
+        help='instance',
+        default="public_prd",
+        metavar="INSTANCE"
+    )
+    args = parser.parse_args()
+    if args.verbose:
+        # Make this module's logger.debug() diagnostics visible. Only this
+        # logger is raised so third-party libraries stay at INFO.
+        logger.setLevel(logging.DEBUG)
+    # Load configuration
+    try:
+        config_path = get_config_path()
+        # Reject any group/other access bits: the file holds DB credentials.
+        # Stricter owner-only modes such as 0400 are accepted.
+        if config_path.stat().st_mode & 0o077:
+            logger.error(
+                "Config file %s permissions too permissive, should be 0600",
+                config_path
+            )
+            sys.exit(1)
+        config = yaml.safe_load(config_path.read_text())
+        if not config:
+            logger.error("Failed to load configuration from %s", config_path)
+            sys.exit(1)
+    except FileNotFoundError as exc:
+        logger.error("Config file not found: %s", exc)
+        sys.exit(1)
+    except OSError as exc:
+        logger.error("Error reading config file: %s", exc)
+        sys.exit(1)
+    except yaml.YAMLError as exc:
+        logger.error("Error parsing config file: %s", exc)
+        sys.exit(1)
+    logger.info("Starting processing")
+    # Start Kerberos ticket refresh worker
+    kinit_worker = kerberos.KinitWorker()
+    kinit_worker.start()
+    # Set up worker processes
+    queue: Queue = Queue(maxsize=100)
+    workers = [
+        Worker(queue, config)
+        for _ in range(args.cpu_count)
+    ]
+    for worker in workers:
+        worker.start()
+    # Every Kafka setting has a built-in default, so tolerate a config
+    # without a "kafka" section instead of failing with KeyError.
+    kafka_config = config.get("kafka") or {}
+    consumer = None
+    try:
+        # Created inside the try block so that a connection failure still
+        # reaches the finally clause and releases the already-started workers.
+        consumer = KafkaConsumer(
+            kafka_config.get("topic", "ingest.logs.cta.taped"),
+            group_id=kafka_config.get("group", "dcache-ctananny-prd"),  # Required to resume
+            bootstrap_servers=kafka_config.get("bootstrap_servers", "lskafka:9092"),
+            auto_offset_reset='latest',    # Fallback if no offset is found
+            enable_auto_commit=True,       # Automatically save progress
+            value_deserializer=safe_json_deserializer,
+        )
+        for msg in consumer:
+            # NOTE(review): /tmp is normally world-writable, so any local user
+            # can stop the daemon by creating this file — confirm acceptable.
+            if os.path.exists("/tmp/STOP"):
+                os.unlink("/tmp/STOP")
+                break
+            message = msg.value
+            if not isinstance(message, dict):
+                logger.warning("Skipping message without a JSON object value: %s", msg)
+                continue
+            # Messages of any other shape are ignored rather than allowed to
+            # crash the consumer loop with a KeyError.
+            payload = message.get("cta")
+            if not isinstance(payload, dict):
+                continue
+            if _VO not in payload or _INSTANCE not in payload:
+                continue
+            vo = payload.get(_VO)
+            instance = payload.get(_INSTANCE)
+            exception_message = payload.get("exceptionMessageValue")
+            if not exception_message:
+                continue
+            logger.debug(f"Found exception {vo} {instance} {exception_message}")
+            if "duplicate key value violates unique constraint" not in exception_message:
+                continue
+            try:
+                # The text looks like "... Key (a, b)=(<instance>, <pnfsid>) already exists."
+                key_values = exception_message.split("=")[1].split("already")[0].strip()
+                disk_instance, pnfsid = [
+                    part.strip() for part in re.sub("[()]", "", key_values).split(",")
+                ]
+            except (IndexError, ValueError) as exc:
+                logger.info(f"Caught exception {exc}")
+                continue
+            if disk_instance == args.instance:
+                logger.debug(f"Found {pnfsid} {disk_instance}, putting on the queue")
+                queue.put(pnfsid)
+    except KeyboardInterrupt:
+        logger.info("Interrupted successfully.")
+    finally:
+        try:
+            # Close first so the consumer leaves its group cleanly.
+            if consumer is not None:
+                consumer.close()
+        finally:
+            # One sentinel per worker tells each of them to exit its loop.
+            for _ in range(args.cpu_count):
+                queue.put(None)
+            # Clean up
+            kinit_worker.stop()
+#    consumer = KafkaConsumer(config["kafka"].get("topic", "ingest.dcache.billing"),
+#                             group_id=config["kafka"].get("group", "dcache-ctananny-prd"),  # Required to resume
+#                             bootstrap_servers=config["kafka"].get("bootstrap_servers", "lskafka:9092"),
+#                             auto_offset_reset='latest',    # Fallback if no offset is found
+#                             #auto_offset_reset='earliest',    # Fallback if no offset is found
+#                             enable_auto_commit=True,           # Automatically save progress
+#                             value_deserializer=lambda m:  safe_json_deserializer(m))
+#                             #value_deserializer=lambda m: json.loads(m.decode("utf-8")))
+#
+#    try:
+#        for msg in consumer:
+#
+#            if os.path.exists("/tmp/STOP"):
+#                break
+#            # Skip invalid bytes entirely
+#            #message = msg.value.decode('utf-8', errors='ignore')
+#            #print(message.keys())
+#
+#            # Replace invalid bytes with a placeholder ()
+#            #decoded_value = message.value.decode('utf-8', errors='replace')
+#            message = msg.value
+#            if not message:
+#                print(f"WHAT {msg}")
+#                continue
+#            msgType = message["msgType"]
+#            if msgType != "store":
+#                continue
+#            error_message, error_code = message["status"].values()
+#
+#            if error_code == 0:
+#                continue
+#            storage_info = message["storageInfo"]
+#            pnfsid = message["pnfsid"].strip()
+#            print(f"{pnfsid} {storage_info} {error_message}")
+#            print(message)
+#            continue
+#            message_string =  message["message"]
+#            payload = message["cta"]
+#            if _VO in payload and _INSTANCE in payload:
+#                vo = payload.get(_VO)
+#                instance = payload.get(_INSTANCE)
+#                exception_message = payload.get("exceptionMessageValue")
+#                if not exception_message:
+#                    continue
+#                logger.debug(f"Found exception {vo} {instance} {exception_message}")
+#                if exception_message.find("duplicate key value violates unique constraint") != -1:
+#                    try:
+#                        tuple =  exception_message.split("=")[1].split("already")[0].strip()
+#                        disk_instance, pnfsid = [i.strip() for i in re.sub("[()]", "", tuple).split(",")]
+#
+#                        if disk_instance == args.instance:
+#                            logger.debug(f"Found {pnfsid} {disk_instance}, putting on the queue")
+#                            queue.put(pnfsid)
+#                    except Exception as e:
+#                        logger.info(f"Caught exception {e}")
+#                        pass
+#    except KeyboardInterrupt:
+#        logger.info("Interrupted successfully.")
+#    finally:
+#        for _ in range(args.cpu_count):
+#            queue.put(None)
+#        # Clean up
+#        kinit_worker.stop()
+if __name__ == "__main__":
+    main()

pydcache/scripts/install_service.py ADDED Viewed

@@ -0,0 +1,97 @@
+"""Install the cta-nanny systemd service unit file."""
+from __future__ import annotations
+import os
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+# Bundled unit file template
+_UNIT_TEMPLATE = Path(__file__).parent.parent / "config" / "cta-nanny.service"
+_SERVICE_NAME = "cta-nanny.service"
+def _find_cta_nanny_exec() -> str:
+    """Locate the cta-nanny executable and return its path as a string.
+    PATH is searched first; if that fails, the directory holding the running
+    Python interpreter is tried.
+    Raises:
+        FileNotFoundError: If neither lookup finds the executable.
+    """
+    on_path = shutil.which("cta-nanny")
+    if on_path:
+        return on_path
+    # Fallback: same bin/ dir as this Python interpreter
+    beside_python = Path(sys.executable).parent / "cta-nanny"
+    if not beside_python.exists():
+        raise FileNotFoundError(
+            "Cannot locate the cta-nanny executable. "
+            "Make sure pydcache is installed and its bin/ directory is on PATH."
+        )
+    return str(beside_python)
+def _system_unit_dir() -> Path:
+    """Return the directory that receives the system-scope unit file."""
+    return Path("/etc", "systemd", "system")
+def _user_unit_dir() -> Path:
+    """Return the per-user systemd unit directory.
+    This is ``$XDG_CONFIG_HOME/systemd/user``, falling back to
+    ``~/.config/systemd/user`` when XDG_CONFIG_HOME is unset or empty.
+    """
+    # An empty XDG_CONFIG_HOME must behave like an unset one; otherwise
+    # Path("") would make the result relative to the current directory.
+    xdg = os.environ.get("XDG_CONFIG_HOME") or Path.home() / ".config"
+    return Path(xdg) / "systemd" / "user"
+def main() -> None:
+    """Install cta-nanny.service and print next steps.
+    root installs into the system unit directory, any other user into the
+    per-user one. The ``CTA_NANNY_EXEC`` placeholder in the bundled unit file
+    is replaced by the resolved executable path, systemd is asked to reload
+    its unit files, and the commands to enable and inspect the service are
+    printed. The service itself is neither enabled nor started here.
+    Exits with status 1 if the executable cannot be located, the unit file
+    cannot be written, or the reload fails.
+    """
+    is_root = os.getuid() == 0
+    # Resolve real executable path and bake it into the unit file
+    try:
+        exec_path = _find_cta_nanny_exec()
+    except FileNotFoundError as exc:
+        print(f"ERROR: {exc}", file=sys.stderr)
+        sys.exit(1)
+    unit_text = _UNIT_TEMPLATE.read_text().replace("CTA_NANNY_EXEC", exec_path)
+    if is_root:
+        dest_dir = _system_unit_dir()
+        scope_args = []          # system scope (default)
+        scope_label = ""
+    else:
+        # NOTE(review): the bundled unit sets User=root/Group=root and
+        # WantedBy=multi-user.target — confirm it is usable in user scope.
+        dest_dir = _user_unit_dir()
+        scope_args = ["--user"]
+        scope_label = " (user scope)"
+    dest = dest_dir / _SERVICE_NAME
+    try:
+        dest_dir.mkdir(parents=True, exist_ok=True)
+        dest.write_text(unit_text)
+    except OSError as exc:
+        print(f"ERROR: cannot write {dest}: {exc}", file=sys.stderr)
+        sys.exit(1)
+    print(f"Installed unit file → {dest}")
+    # Reload systemd so it picks up the new unit file
+    try:
+        subprocess.run(["systemctl", *scope_args, "daemon-reload"], check=True)
+    except (OSError, subprocess.CalledProcessError) as exc:
+        print(f"ERROR: systemctl daemon-reload failed: {exc}", file=sys.stderr)
+        sys.exit(1)
+    # Both scopes print the same instructions; only the --user flag differs.
+    systemctl = " ".join(["systemctl", *scope_args])
+    journalctl = " ".join(["journalctl", *scope_args])
+    print(f"\nUnit file installed{scope_label}. Review it before enabling:\n")
+    print(f"  nano {dest}\n")
+    print("Then enable and start the service:")
+    print(f"  {systemctl} enable --now {_SERVICE_NAME}")
+    print("\nUseful commands:")
+    print(f"  {systemctl} status  {_SERVICE_NAME}")
+    print(f"  {systemctl} restart {_SERVICE_NAME}")
+    print(f"  {journalctl} -u {_SERVICE_NAME} -f")
+    if not is_root:
+        print(
+            "\nNOTE: for a user service to survive logout, run once as root:\n"
+            f"  loginctl enable-linger {os.environ.get('USER', 'youruser')}"
+        )
+if __name__ == "__main__":
+    main()

pydcache/util/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""pydcache – collection of dCache Python utilities."""
+# NOTE(review): pydcache/__init__.py and the wheel name both say 0.1.6 — confirm whether this sub-package version is meant to differ.
+__version__ = "0.1.0"