npcpy 1.2.34__py3-none-any.whl → 1.2.36__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
npcpy/gen/ocr.py ADDED
@@ -0,0 +1,187 @@
+ """
+ Utilities for running DeepSeek OCR (via Unsloth) to turn images into text.
+
+ This is intentionally lightweight: the model is only downloaded/loaded when
+ `DeepSeekOCR.run` is called. You can point `model_id` at a local path or a
+ Hugging Face repo ID; we default to the public `unsloth/DeepSeek-OCR`.
+ """
+
+ from __future__ import annotations
+
+ import os
+ import tempfile
+ from dataclasses import dataclass
+ from typing import Optional, Union
+
+ try:
+     from PIL import Image
+ except ImportError:
+     Image = None  # Delayed import for lightweight environments
+
+ ImageInput = Union[str, bytes, "Image.Image"]
+
+
+ @dataclass
+ class DeepSeekOCR:
+     """Lazy loader/wrapper around the Unsloth DeepSeek OCR vision model."""
+
+     model_id: str = "unsloth/DeepSeek-OCR"
+     local_dir: str = os.path.expanduser("~/.npcsh/models/deepseek_ocr")
+     load_in_4bit: bool = False
+     base_size: int = 1024
+     image_size: int = 640
+     crop_mode: bool = True
+
+     def __post_init__(self) -> None:
+         self._model = None
+         self._tokenizer = None
+
+     def _ensure_weights(self) -> str:
+         """Download weights if they are not already on-disk."""
+         if os.path.isdir(self.local_dir) and os.listdir(self.local_dir):
+             return self.local_dir
+
+         os.makedirs(self.local_dir, exist_ok=True)
+         try:
+             from huggingface_hub import snapshot_download
+         except ImportError as exc:
+             raise ImportError(
+                 "huggingface_hub is required to download DeepSeek OCR weights. "
+                 "Install with `pip install huggingface_hub` or pre-download manually."
+             ) from exc
+
+         snapshot_download(self.model_id, local_dir=self.local_dir)
+         return self.local_dir
+
+     def _load_model(self) -> None:
+         """Load the Unsloth vision model once (lazy)."""
+         if self._model is not None and self._tokenizer is not None:
+             return
+
+         weights_dir = self._ensure_weights()
+         os.environ.setdefault("UNSLOTH_WARN_UNINITIALIZED", "0")
+
+         try:
+             from unsloth import FastVisionModel
+             from transformers import AutoModel
+         except ImportError as exc:
+             raise ImportError(
+                 "unsloth and transformers are required to run DeepSeek OCR. "
+                 "Install with `pip install unsloth transformers` (and bitsandbytes if using 4bit)."
+             ) from exc
+
+         self._model, self._tokenizer = FastVisionModel.from_pretrained(
+             weights_dir,
+             load_in_4bit=self.load_in_4bit,
+             auto_model=AutoModel,
+             trust_remote_code=True,
+             unsloth_force_compile=True,
+             use_gradient_checkpointing="unsloth",
+         )
+
+     def _prepare_image_file(self, image: ImageInput) -> tuple[str, bool]:
+         """Normalize various image inputs to a file path and say if we should clean it up."""
+         if isinstance(image, str):
+             if not os.path.exists(image):
+                 raise FileNotFoundError(f"Image path does not exist: {image}")
+             return image, False
+
+         if Image is None:
+             raise ImportError("Pillow is required for OCR image handling. Install with `pip install pillow`.")
+
+         if isinstance(image, bytes):
+             import io
+
+             pil = Image.open(io.BytesIO(image)).convert("RGB")
+         elif isinstance(image, Image.Image):
+             pil = image.convert("RGB")
+         else:
+             raise TypeError(f"Unsupported image input type: {type(image)}")
+
+         tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
+         pil.save(tmp, format="PNG")
+         tmp.close()
+         return tmp.name, True
+
+     def run(
+         self,
+         image: ImageInput,
+         prompt: str = "<image>\nFree OCR. ",
+         output_path: Optional[str] = None,
+         save_results: bool = False,
+         test_compress: bool = False,
+         **kwargs,
+     ) -> str:
+         """
+         Run OCR on an image and return the recognized text.
+
+         Args:
+             image: Path, bytes, or PIL Image.
+             prompt: Prompt passed to the vision model (keeps the default used
+                 in the reference notebook).
+             output_path: Optional directory for saving debug outputs.
+             save_results: If True, Unsloth will save visualization artifacts.
+             test_compress: Forwarded to `model.infer`.
+             kwargs: Additional overrides for infer (base_size, image_size, etc).
+         """
+         self._load_model()
+
+         image_file, should_cleanup = self._prepare_image_file(image)
+         infer_kwargs = {
+             "prompt": prompt,
+             "image_file": image_file,
+             "output_path": output_path or "",
+             "base_size": kwargs.pop("base_size", self.base_size),
+             "image_size": kwargs.pop("image_size", self.image_size),
+             "crop_mode": kwargs.pop("crop_mode", self.crop_mode),
+             "save_results": save_results,
+             "test_compress": test_compress,
+         }
+
+         try:
+             result = self._model.infer(self._tokenizer, **infer_kwargs)
+         finally:
+             # Clean up temp files created from bytes/PIL inputs.
+             if should_cleanup and os.path.exists(image_file):
+                 try:
+                     os.remove(image_file)
+                 except OSError:
+                     pass
+
+         # Unsloth infer returns a dict-like object; stringify for callers.
+         if isinstance(result, str):
+             return result.strip()
+         if isinstance(result, dict) and "text" in result:
+             return str(result["text"]).strip()
+         return str(result).strip()
+
+
+ def deepseek_ocr(
+     image: ImageInput,
+     prompt: str = "<image>\nFree OCR. ",
+     model_id: str = "unsloth/DeepSeek-OCR",
+     local_dir: Optional[str] = None,
+     **kwargs,
+ ) -> str:
+     """
+     Functional wrapper that mirrors the reference notebook defaults.
+
+     Example:
+         text = deepseek_ocr(\"invoice.png\")
+     """
+     runner = DeepSeekOCR(
+         model_id=model_id,
+         local_dir=local_dir or os.path.expanduser("~/.npcsh/models/deepseek_ocr"),
+         load_in_4bit=kwargs.pop("load_in_4bit", False),
+         base_size=kwargs.pop("base_size", 1024),
+         image_size=kwargs.pop("image_size", 640),
+         crop_mode=kwargs.pop("crop_mode", True),
+     )
+     return runner.run(
+         image=image,
+         prompt=prompt,
+         output_path=kwargs.pop("output_path", None),
+         save_results=kwargs.pop("save_results", False),
+         test_compress=kwargs.pop("test_compress", False),
+         **kwargs,
+     )
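
The new module exposes both a class-based and a one-shot functional entry point. A minimal usage sketch, assuming the optional `unsloth`, `transformers`, and `pillow` dependencies are installed (the file names here are placeholders):

```python
# Sketch only: exercises the new npcpy/gen/ocr.py module added above.
# The first call downloads weights to ~/.npcsh/models/deepseek_ocr by default.
from npcpy.gen.ocr import DeepSeekOCR, deepseek_ocr

# One-shot functional wrapper:
text = deepseek_ocr("invoice.png")  # "invoice.png" is a placeholder path
print(text)

# Reusable wrapper: the model loads lazily on the first run() and is cached
# on the instance, so later calls skip the download/load.
ocr = DeepSeekOCR(load_in_4bit=True)
for path in ["page1.png", "page2.png"]:
    print(ocr.run(path))
```
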
@@ -405,9 +405,13 @@ def save_kg_to_db(engine: Engine, kg_data: Dict[str, Any], team_name: str, npc_n
  def generate_message_id() -> str:
      return str(uuid.uuid4())

+
+
+ from sqlalchemy import event, Table, Column, Integer, String, Text
+ from sqlalchemy.orm import mapper
+
  class CommandHistory:
      def __init__(self, db: Union[str, Engine] = "~/npcsh_history.db"):
-
          if isinstance(db, str):
              self.engine = create_engine_from_path(db)
              self.db_path = db
@@ -415,15 +419,54 @@ class CommandHistory:
              self.engine = db
              self.db_path = str(db.url)
          else:
-             raise TypeError(f"Unsupported type for CommandHistory db parameter: {type(db)}")
+             raise TypeError(f"Unsupported type: {type(db)}")

          self._initialize_schema()
-
+         self._setup_execution_triggers()
+         self.backfill_execution_tables()
+     def backfill_execution_tables(self):
+         with self.engine.begin() as conn:
+             conn.execute(text("""
+                 INSERT OR IGNORE INTO jinx_executions
+                 (message_id, jinx_name, input, timestamp, npc, team,
+                  conversation_id)
+                 SELECT
+                     message_id,
+                     SUBSTR(content, 2,
+                         CASE
+                             WHEN INSTR(SUBSTR(content, 2), ' ') > 0
+                             THEN INSTR(SUBSTR(content, 2), ' ') - 1
+                             ELSE LENGTH(content) - 1
+                         END
+                     ),
+                     content,
+                     timestamp,
+                     npc,
+                     team,
+                     conversation_id
+                 FROM conversation_history
+                 WHERE role = 'user' AND content LIKE '/%'
+             """))
+
+             conn.execute(text("""
+                 INSERT OR IGNORE INTO npc_executions
+                 (message_id, input, timestamp, npc, team, conversation_id,
+                  model, provider)
+                 SELECT
+                     message_id,
+                     content,
+                     timestamp,
+                     npc,
+                     team,
+                     conversation_id,
+                     model,
+                     provider
+                 FROM conversation_history
+                 WHERE role = 'user' AND npc IS NOT NULL
+             """))
      def _initialize_schema(self):
-         """Creates all necessary tables."""
          metadata = MetaData()

-
          Table('command_history', metadata,
              Column('id', Integer, primary_key=True, autoincrement=True),
              Column('timestamp', String(50)),
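
The `SUBSTR`/`INSTR` expression in the backfill (and in the triggers later in this diff) pulls the command name out of a slash command. An illustrative Python rendering of just that expression, not part of the package:

```python
# Illustrative only: Python equivalent of the SQL jinx-name extraction above.
def extract_jinx_name(content: str) -> str:
    body = content[1:]        # SUBSTR(content, 2): drop the leading '/'
    space = body.find(" ")    # INSTR(...) is 1-based; find() is 0-based
    # WHEN INSTR(...) > 0: take everything before the first space;
    # ELSE: take the whole remainder (LENGTH(content) - 1 characters).
    return body[:space] if space != -1 else body

assert extract_jinx_name("/search cats on mars") == "search"
assert extract_jinx_name("/help") == "help"
```
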
@@ -433,7 +476,6 @@ class CommandHistory:
              Column('location', Text)
          )

-
          Table('conversation_history', metadata,
              Column('id', Integer, primary_key=True, autoincrement=True),
              Column('message_id', String(50), unique=True, nullable=False),
@@ -448,33 +490,48 @@ class CommandHistory:
              Column('team', String(100))
          )

-
          Table('message_attachments', metadata,
              Column('id', Integer, primary_key=True, autoincrement=True),
-             Column('message_id', String(50), ForeignKey('conversation_history.message_id', ondelete='CASCADE'), nullable=False),
+             Column('message_id', String(50),
+                    ForeignKey('conversation_history.message_id',
+                               ondelete='CASCADE'),
+                    nullable=False),
              Column('attachment_name', String(255)),
              Column('attachment_type', String(100)),
              Column('attachment_data', LargeBinary),
              Column('attachment_size', Integer),
              Column('upload_timestamp', String(50)),
-             Column('file_path', Text)
+             Column('file_path', Text)
+         )
+
+         Table('labels', metadata,
+             Column('id', Integer, primary_key=True, autoincrement=True),
+             Column('entity_type', String(50), nullable=False),
+             Column('entity_id', String(100), nullable=False),
+             Column('label', String(100), nullable=False),
+             Column('metadata', Text),
+             Column('created_at', DateTime, default=func.now())
+         )
+
+         Table('jinx_executions', metadata,
+             Column('message_id', String(50), primary_key=True),
+             Column('jinx_name', String(100)),
+             Column('input', Text),
+             Column('timestamp', String(50)),
+             Column('npc', String(100)),
+             Column('team', String(100)),
+             Column('conversation_id', String(100))
          )

-
-         Table('jinx_execution_log', metadata,
-             Column('execution_id', Integer, primary_key=True, autoincrement=True),
-             Column('triggering_message_id', String(50), ForeignKey('conversation_history.message_id', ondelete='CASCADE'), nullable=False),
-             Column('response_message_id', String(50), ForeignKey('conversation_history.message_id', ondelete='SET NULL')),
-             Column('conversation_id', String(100), nullable=False),
-             Column('timestamp', String(50), nullable=False),
-             Column('npc_name', String(100)),
-             Column('team_name', String(100)),
-             Column('jinx_name', String(100), nullable=False),
-             Column('jinx_inputs', Text),
-             Column('jinx_output', Text),
-             Column('status', String(50), nullable=False),
-             Column('error_message', Text),
-             Column('duration_ms', Integer)
+         Table('npc_executions', metadata,
+             Column('message_id', String(50), primary_key=True),
+             Column('input', Text),
+             Column('timestamp', String(50)),
+             Column('npc', String(100)),
+             Column('team', String(100)),
+             Column('conversation_id', String(100)),
+             Column('model', String(100)),
+             Column('provider', String(100))
          )

          Table('memory_lifecycle', metadata,
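
The wide `jinx_execution_log` table is dropped in favor of two narrower tables keyed by `message_id`. Once they are populated by the backfill and triggers, they can be inspected directly; a sketch against the default SQLite history database (path taken from the `CommandHistory` default above):

```python
# Sketch: count executions per jinx in the new table. Assumes the default
# SQLite backend at ~/npcsh_history.db; adjust for other engines.
import os
import sqlite3

con = sqlite3.connect(os.path.expanduser("~/npcsh_history.db"))
for name, count in con.execute(
    "SELECT jinx_name, COUNT(*) FROM jinx_executions GROUP BY jinx_name"
):
    print(name, count)
con.close()
```
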
@@ -492,30 +549,137 @@ class CommandHistory:
              Column('provider', String(100)),
              Column('created_at', DateTime, default=func.now())
          )
-

          metadata.create_all(self.engine, checkfirst=True)
+         init_kg_schema(self.engine)
+
+     def _setup_execution_triggers(self):
+         if 'sqlite' in str(self.engine.url):
+             with self.engine.begin() as conn:
+                 conn.execute(text("""
+                     CREATE TRIGGER IF NOT EXISTS populate_jinx_executions
+                     AFTER INSERT ON conversation_history
+                     WHEN NEW.role = 'user' AND NEW.content LIKE '/%'
+                     BEGIN
+                         INSERT OR IGNORE INTO jinx_executions
+                         (message_id, jinx_name, input, timestamp, npc, team,
+                          conversation_id)
+                         VALUES (
+                             NEW.message_id,
+                             SUBSTR(NEW.content, 2,
+                                 CASE
+                                     WHEN INSTR(SUBSTR(NEW.content, 2), ' ') > 0
+                                     THEN INSTR(SUBSTR(NEW.content, 2), ' ') - 1
+                                     ELSE LENGTH(NEW.content) - 1
+                                 END
+                             ),
+                             NEW.content,
+                             NEW.timestamp,
+                             NEW.npc,
+                             NEW.team,
+                             NEW.conversation_id
+                         );
+                     END
+                 """))
+
+                 conn.execute(text("""
+                     CREATE TRIGGER IF NOT EXISTS populate_npc_executions
+                     AFTER INSERT ON conversation_history
+                     WHEN NEW.role = 'user' AND NEW.npc IS NOT NULL
+                     BEGIN
+                         INSERT OR IGNORE INTO npc_executions
+                         (message_id, input, timestamp, npc, team,
+                          conversation_id, model, provider)
+                         VALUES (
+                             NEW.message_id,
+                             NEW.content,
+                             NEW.timestamp,
+                             NEW.npc,
+                             NEW.team,
+                             NEW.conversation_id,
+                             NEW.model,
+                             NEW.provider
+                         );
+                     END
+                 """))
+
+     def get_jinx_executions(self, jinx_name: str = None, limit: int = 1000) -> List[Dict]:
+         if jinx_name:
+             stmt = """
+                 SELECT je.*, l.label
+                 FROM jinx_executions je
+                 LEFT JOIN labels l ON l.entity_type = 'message'
+                     AND l.entity_id = je.message_id
+                 WHERE je.jinx_name = :jinx_name
+                 ORDER BY je.timestamp DESC
+                 LIMIT :limit
+             """
+             return self._fetch_all(stmt, {"jinx_name": jinx_name, "limit": limit})

+         stmt = """
+             SELECT je.*, l.label
+             FROM jinx_executions je
+             LEFT JOIN labels l ON l.entity_type = 'message'
+                 AND l.entity_id = je.message_id
+             ORDER BY je.timestamp DESC
+             LIMIT :limit
+         """
+         return self._fetch_all(stmt, {"limit": limit})
+
+     def get_npc_executions(self, npc_name: str, limit: int = 1000) -> List[Dict]:
+         stmt = """
+             SELECT ne.*, l.label
+             FROM npc_executions ne
+             LEFT JOIN labels l ON l.entity_type = 'message'
+                 AND l.entity_id = ne.message_id
+             WHERE ne.npc = :npc_name
+             ORDER BY ne.timestamp DESC
+             LIMIT :limit
+         """
+         return self._fetch_all(stmt, {"npc_name": npc_name, "limit": limit})
+
+     def label_execution(self, message_id: str, label: str):
+         self.add_label('message', message_id, label)

+     def add_label(self, entity_type: str, entity_id: str, label: str, metadata: dict = None):
+         stmt = """
+             INSERT INTO labels (entity_type, entity_id, label, metadata)
+             VALUES (:entity_type, :entity_id, :label, :metadata)
+         """
          with self.engine.begin() as conn:
-
-             index_queries = [
-                 "CREATE INDEX IF NOT EXISTS idx_jinx_log_trigger_msg ON jinx_execution_log (triggering_message_id)",
-                 "CREATE INDEX IF NOT EXISTS idx_jinx_log_convo_id ON jinx_execution_log (conversation_id)",
-                 "CREATE INDEX IF NOT EXISTS idx_jinx_log_jinx_name ON jinx_execution_log (jinx_name)",
-                 "CREATE INDEX IF NOT EXISTS idx_jinx_log_timestamp ON jinx_execution_log (timestamp)"
-             ]
-
-             for idx_query in index_queries:
-                 try:
-                     conn.execute(text(idx_query))
-                 except SQLAlchemyError:
-
-                     pass
+             conn.execute(text(stmt), {
+                 "entity_type": entity_type,
+                 "entity_id": entity_id,
+                 "label": label,
+                 "metadata": json.dumps(metadata) if metadata else None
+             })
+
+     def get_labels(self, entity_type: str = None, label: str = None) -> List[Dict]:
+         conditions = []
+         params = {}

+         if entity_type:
+             conditions.append("entity_type = :entity_type")
+             params["entity_type"] = entity_type
+         if label:
+             conditions.append("label = :label")
+             params["label"] = label

-         init_kg_schema(self.engine)
+         where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
+         stmt = f"SELECT * FROM labels {where} ORDER BY created_at DESC"
+
+         return self._fetch_all(stmt, params)

+     def get_training_data_by_label(self, label: str = 'training') -> List[Dict]:
+         stmt = """
+             SELECT l.entity_type, l.entity_id, l.metadata,
+                    ch.content, ch.role, ch.npc, ch.conversation_id
+             FROM labels l
+             LEFT JOIN conversation_history ch ON
+                 (l.entity_type = 'message' AND l.entity_id = ch.message_id)
+             WHERE l.label = :label
+         """
+         return self._fetch_all(stmt, {"label": label})
      def _execute_returning_id(self, stmt: str, params: Dict = None) -> Optional[int]:
          """Execute INSERT and return the generated ID"""
          with self.engine.begin() as conn:
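
Together these methods give `CommandHistory` a small labeling and retrieval API over the new tables. A usage sketch; the import path and the `message_id` value are assumptions, not taken from this diff:

```python
# Sketch of the labeling/query API added in this release.
from npcpy.memory.command_history import CommandHistory  # import path assumed

history = CommandHistory("~/npcsh_history.db")

# Tag a message so it can be pulled into a training set later.
history.label_execution("hypothetical-message-id", "training")

# Retrieve labeled rows and recent executions.
training_rows = history.get_training_data_by_label("training")
recent_jinxes = history.get_jinx_executions(limit=20)
searches = history.get_jinx_executions(jinx_name="search")
```
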
@@ -535,6 +699,7 @@ class CommandHistory:
              result = conn.execute(text(stmt), params or {})
              return [dict(row._mapping) for row in result]

+
      def add_command(self, command, subcommands, output, location):
          timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
          stmt = """
@@ -1092,6 +1257,32 @@ def start_new_conversation(prepend: str = None) -> str:
          prepend = 'npcsh'
      return f"{prepend}_{datetime.now().strftime('%Y%m%d%H%M%S')}"

+
+ def format_memory_context(memory_examples):
+     if not memory_examples:
+         return ""
+
+     context_parts = []
+
+     approved_examples = memory_examples.get("approved", [])
+     rejected_examples = memory_examples.get("rejected", [])
+
+     if approved_examples:
+         context_parts.append("EXAMPLES OF GOOD MEMORIES:")
+         for ex in approved_examples[:5]:
+             final = ex.get("final_memory") or ex.get("initial_memory")
+             context_parts.append(f"- {final}")
+
+     if rejected_examples:
+         context_parts.append("\nEXAMPLES OF POOR MEMORIES TO AVOID:")
+         for ex in rejected_examples[:3]:
+             context_parts.append(f"- {ex.get('initial_memory')}")
+
+     if context_parts:
+         context_parts.append("\nLearn from these examples to generate similar high-quality memories.")
+         return "\n".join(context_parts)
+
+     return ""
  def save_conversation_message(
      command_history: CommandHistory,
      conversation_id: str,
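
For reference, a sketch of what the new `format_memory_context` produces for a small input. The example dict is invented; only the `approved`/`rejected` keys and the `initial_memory`/`final_memory` fields come from the code above:

```python
examples = {
    "approved": [{"initial_memory": "likes short answers",
                  "final_memory": "User prefers short, direct answers"}],
    "rejected": [{"initial_memory": "said hello at 9am"}],
}
print(format_memory_context(examples))
# EXAMPLES OF GOOD MEMORIES:
# - User prefers short, direct answers
#
# EXAMPLES OF POOR MEMORIES TO AVOID:
# - said hello at 9am
#
# Learn from these examples to generate similar high-quality memories.
```
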