mega-brain-ai 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mega-brain-ai might be problematic. Click here for more details.
- package/.claude/CLAUDE.md +155 -0
- package/.claude/commands/agents.md +161 -0
- package/.claude/commands/ask.md +117 -0
- package/.claude/commands/benchmark.md +224 -0
- package/.claude/commands/chat.md +343 -0
- package/.claude/commands/compare.md +116 -0
- package/.claude/commands/conclave.md +194 -0
- package/.claude/commands/config.md +133 -0
- package/.claude/commands/council.md +194 -0
- package/.claude/commands/create-agent.md +452 -0
- package/.claude/commands/debate.md +157 -0
- package/.claude/commands/documentation/create-architecture-documentation.md +175 -0
- package/.claude/commands/dossiers.md +180 -0
- package/.claude/commands/evolve.md +223 -0
- package/.claude/commands/extract-dna.md +170 -0
- package/.claude/commands/extract-knowledge.md +507 -0
- package/.claude/commands/inbox.md +296 -0
- package/.claude/commands/ingest-empresa.md +191 -0
- package/.claude/commands/ingest.md +182 -0
- package/.claude/commands/jarvis-briefing.md +67 -0
- package/.claude/commands/jarvis-control.md +169 -0
- package/.claude/commands/jarvis-full.md +181 -0
- package/.claude/commands/jarvis.md +212 -0
- package/.claude/commands/ler-drive.md +212 -0
- package/.claude/commands/log.md +158 -0
- package/.claude/commands/loop.md +133 -0
- package/.claude/commands/loops.md +73 -0
- package/.claude/commands/mission-autopilot.md +538 -0
- package/.claude/commands/mission.md +353 -0
- package/.claude/commands/process-inbox.md +148 -0
- package/.claude/commands/process-jarvis.md +3036 -0
- package/.claude/commands/process-video.md +131 -0
- package/.claude/commands/rag-search.md +78 -0
- package/.claude/commands/resume.md +33 -0
- package/.claude/commands/save.md +38 -0
- package/.claude/commands/scan-inbox.md +125 -0
- package/.claude/commands/setup.md +99 -0
- package/.claude/commands/system-digest.md +243 -0
- package/.claude/commands/verify.md +182 -0
- package/.claude/commands/view-dna.md +169 -0
- package/.claude/hooks/agent_doctor.py +433 -0
- package/.claude/hooks/agent_memory_persister.py +203 -0
- package/.claude/hooks/auto_formatter.py +158 -0
- package/.claude/hooks/checkpoint_writer.py +244 -0
- package/.claude/hooks/claude_md_guard.py +146 -0
- package/.claude/hooks/creation_validator.py +357 -0
- package/.claude/hooks/enforce_dual_location.py +501 -0
- package/.claude/hooks/enforce_plan_mode.py +220 -0
- package/.claude/hooks/inbox_age_alert.py +367 -0
- package/.claude/hooks/jarvis_briefing.py +506 -0
- package/.claude/hooks/ledger_updater.py +301 -0
- package/.claude/hooks/memory_hints_injector.py +251 -0
- package/.claude/hooks/memory_updater.py +202 -0
- package/.claude/hooks/multi_agent_hook.py +464 -0
- package/.claude/hooks/notification_system.py +120 -0
- package/.claude/hooks/pattern_analyzer.py +526 -0
- package/.claude/hooks/pending_tracker.py +188 -0
- package/.claude/hooks/post_batch_cascading.py +1740 -0
- package/.claude/hooks/post_output_validator.py +358 -0
- package/.claude/hooks/post_tool_use.py +120 -0
- package/.claude/hooks/post_write_validator.py +200 -0
- package/.claude/hooks/quality_watchdog.py +394 -0
- package/.claude/hooks/ralph_wiggum.py +277 -0
- package/.claude/hooks/session-source-sync.py +218 -0
- package/.claude/hooks/session_autosave_v2.py +1135 -0
- package/.claude/hooks/session_end.py +203 -0
- package/.claude/hooks/session_start.py +939 -0
- package/.claude/hooks/skill_indexer.py +48 -0
- package/.claude/hooks/skill_router.py +358 -0
- package/.claude/hooks/stop_hook_completeness.py +178 -0
- package/.claude/hooks/subagent_tracker.py +163 -0
- package/.claude/hooks/token_checkpoint.py +584 -0
- package/.claude/hooks/user_prompt_submit.py +125 -0
- package/.claude/rules/ANTHROPIC-STANDARDS.md +384 -0
- package/.claude/rules/CLAUDE-LITE.md +201 -0
- package/.claude/rules/RULE-GROUP-1.md +320 -0
- package/.claude/rules/RULE-GROUP-2.md +307 -0
- package/.claude/rules/RULE-GROUP-3.md +248 -0
- package/.claude/rules/RULE-GROUP-4.md +427 -0
- package/.claude/rules/RULE-GROUP-5.md +388 -0
- package/.claude/rules/RULE-GROUP-6.md +387 -0
- package/.claude/rules/logging.md +53 -0
- package/.claude/rules/mcp-governance.md +128 -0
- package/.claude/rules/pipeline.md +60 -0
- package/.claude/rules/state-management.md +93 -0
- package/.claude/scripts/apply-tags.py +77 -0
- package/.claude/scripts/batch-extract-transcriptions.py +132 -0
- package/.claude/scripts/build-complete-index.py +250 -0
- package/.claude/scripts/build-planilha-index.py +170 -0
- package/.claude/scripts/complete-tag-matching.py +250 -0
- package/.claude/scripts/deduplicate-inbox.py +139 -0
- package/.claude/scripts/docx-xml-extractor.py +141 -0
- package/.claude/scripts/extract-docx-text.py +58 -0
- package/.claude/scripts/extract-single-transcription.py +74 -0
- package/.claude/scripts/extract_docx_from_gdrive.py +77 -0
- package/.claude/scripts/organized-downloader.py +246 -0
- package/.claude/scripts/planilha-tagger.py +187 -0
- package/.claude/scripts/revert-tags.py +70 -0
- package/.claude/scripts/source-sync.py +265 -0
- package/.claude/scripts/tag-inbox-files.py +276 -0
- package/.claude/scripts/tag-inbox-v2.py +253 -0
- package/.claude/scripts/test-extraction.py +35 -0
- package/.claude/scripts/test-full-extraction.py +74 -0
- package/.claude/skills/00-SKILL-CREATOR/SKILL.md +186 -0
- package/.claude/skills/01-SKILL-DOCS-MEGABRAIN/SKILL.md +251 -0
- package/.claude/skills/02-SKILL-PYTHON-MEGABRAIN/SKILL.md +323 -0
- package/.claude/skills/03-SKILL-AGENT-CREATION/SKILL.md +374 -0
- package/.claude/skills/04-SKILL-KNOWLEDGE-EXTRACTION/SKILL.md +318 -0
- package/.claude/skills/05-SKILL-PIPELINE-JARVIS/SKILL.md +430 -0
- package/.claude/skills/06-SKILL-BRAINSTORMING/SKILL.md +72 -0
- package/.claude/skills/07-SKILL-DISPATCHING-PARALLEL-AGENTS/SKILL.md +193 -0
- package/.claude/skills/08-SKILL-EXECUTING-PLANS/SKILL.md +114 -0
- package/.claude/skills/09-SKILL-WRITING-PLANS/SKILL.md +184 -0
- package/.claude/skills/10-SKILL-VERIFICATION-BEFORE-COMPLETION/SKILL.md +130 -0
- package/.claude/skills/11-SKILL-USING-SUPERPOWERS/SKILL.md +105 -0
- package/.claude/skills/DETECTION-PROTOCOL.md +217 -0
- package/.claude/skills/README.md +240 -0
- package/.claude/skills/SKILL-REGISTRY.md +284 -0
- package/.claude/skills/SKILL-SUGGESTIONS.md +114 -0
- package/.claude/skills/_TEMPLATES/SKILL-WRITER-GUIDE.md +385 -0
- package/.claude/skills/chronicler/SKILL.md +146 -0
- package/.claude/skills/chronicler/chronicler_core.py +468 -0
- package/.claude/skills/code-review/SKILL.md +160 -0
- package/.claude/skills/council/SKILL.md +210 -0
- package/.claude/skills/executor/SKILL.md +161 -0
- package/.claude/skills/fase-2-5-tagging/SKILL.md +182 -0
- package/.claude/skills/feature-dev/SKILL.md +154 -0
- package/.claude/skills/finance-agent/SKILL.md +137 -0
- package/.claude/skills/frontend-design/SKILL.md +165 -0
- package/.claude/skills/gdrive-transcription-downloader/SKILL.md +249 -0
- package/.claude/skills/gemini-fallback/SKILL.md +67 -0
- package/.claude/skills/gemini-fallback/gemini_fetch.py +0 -0
- package/.claude/skills/gha/SKILL.md +96 -0
- package/.claude/skills/gha/gha_diagnostic.py +227 -0
- package/.claude/skills/github-workflow/SKILL.md +190 -0
- package/.claude/skills/hookify/SKILL.md +134 -0
- package/.claude/skills/hybrid-source-reading/SKILL.md +265 -0
- package/.claude/skills/jarvis/SKILL.md +546 -0
- package/.claude/skills/jarvis-briefing/SKILL.md +340 -0
- package/.claude/skills/ler-planilha/SKILL.md +281 -0
- package/.claude/skills/plugin-dev/SKILL.md +176 -0
- package/.claude/skills/pr-review-toolkit/SKILL.md +178 -0
- package/.claude/skills/resume/SKILL.md +61 -0
- package/.claude/skills/save/SKILL.md +87 -0
- package/.claude/skills/skill-writer/SKILL.md +153 -0
- package/.claude/skills/skill-writer/examples.md +191 -0
- package/.claude/skills/skill-writer/troubleshooting.md +205 -0
- package/.claude/skills/smart-download-tagger/SKILL.md +148 -0
- package/.claude/skills/source-sync/SKILL.md +240 -0
- package/.claude/skills/sync-docs/SKILL.md +193 -0
- package/.claude/skills/sync-docs/config.json +37 -0
- package/.claude/skills/sync-docs/gdrive_sync.py +358 -0
- package/.claude/skills/sync-docs/reauth.py +71 -0
- package/.claude/skills/talent-agent/SKILL.md +183 -0
- package/.claude/skills/verify/SKILL.md +154 -0
- package/.claude/skills/verify/verify_runner.py +0 -0
- package/.claude/skills/verify-6-levels/SKILL.md +234 -0
- package/.claude/templates/BATCH-LOG-TEMPLATE.md +221 -0
- package/.claudeignore +9 -0
- package/.gitattributes +4 -0
- package/.github/layer1-allowlist.txt +80 -0
- package/.github/layer2-manifest.txt +40 -0
- package/.gitignore +219 -0
- package/README.md +1210 -0
- package/agents/_templates/INDEX.md +741 -0
- package/agents/_templates/TEMPLATE-AGENT-MD-ULTRA-ROBUSTO-V3.md +2399 -0
- package/agents/boardroom/CHECKLIST-MASTER.md +281 -0
- package/agents/boardroom/INTEGRATION-GUIDE.md +406 -0
- package/agents/boardroom/README.md +238 -0
- package/agents/boardroom/config/BOARDROOM-CONFIG.md +186 -0
- package/agents/boardroom/config/TTS-INTEGRATION.md +258 -0
- package/agents/boardroom/config/VOICE-PROFILES.md +624 -0
- package/agents/boardroom/config/voice_mapping.json +128 -0
- package/agents/boardroom/scripts/audio_generator.py +375 -0
- package/agents/boardroom/scripts/audio_generator_edge.py +353 -0
- package/agents/boardroom/scripts/jarvis_boardroom_hook.py +415 -0
- package/agents/boardroom/scripts/notebooklm_generator.py +578 -0
- package/agents/boardroom/templates/EPISODE-TEMPLATE.md +367 -0
- package/agents/boardroom/templates/scene-templates/SCENE-AGENT-DEBATE.md +252 -0
- package/agents/boardroom/templates/scene-templates/SCENE-COUNCIL.md +270 -0
- package/agents/boardroom/templates/scene-templates/SCENE-DNA-CONSULTATION.md +126 -0
- package/agents/boardroom/templates/scene-templates/SCENE-QUESTION.md +174 -0
- package/agents/boardroom/workflows/WORKFLOW-AUDIO-GENERATION.md +421 -0
- package/agents/constitution/BASE-CONSTITUTION.md +254 -0
- package/agents/council/CRITIC.md +197 -0
- package/agents/council/DEVILS-ADVOCATE.md +274 -0
- package/agents/council/SYNTHESIZER.md +293 -0
- package/agents/council/advogado-do-diabo/AGENT.md +489 -0
- package/agents/council/advogado-do-diabo/SOUL.md +100 -0
- package/agents/council/critico-metodologico/AGENT.md +670 -0
- package/agents/council/critico-metodologico/SOUL.md +107 -0
- package/agents/council/sintetizador/AGENT.md +558 -0
- package/agents/council/sintetizador/SOUL.md +94 -0
- package/agents/persons/_example/AGENT-EXAMPLE.md +42 -0
- package/agents/persons/_example/DNA-EXAMPLE.yaml +61 -0
- package/agents/protocols/AGENT-COGNITION-PROTOCOL.md +779 -0
- package/agents/protocols/AGENT-INTEGRITY-PROTOCOL.md +692 -0
- package/agents/protocols/BATCH-VISUAL-PROTOCOL.md +841 -0
- package/agents/protocols/DNA-CONFIG-TEMPLATE.yaml +181 -0
- package/agents/protocols/DNA-EXTRACTION-PROTOCOL.md +370 -0
- package/agents/protocols/EPISTEMIC-PROTOCOL.md +333 -0
- package/agents/protocols/LOG-STRUCTURE-PROTOCOL.md +65 -0
- package/agents/protocols/MEMORY-PROTOCOL.md +567 -0
- package/agents/protocols/NARRATIVE-SYNTHESIS-PROTOCOL.md +278 -0
- package/agents/protocols/PHASE-4-VERIFICATION-CHECKPOINT.md +146 -0
- package/agents/protocols/SOUL-TEMPLATE.md +416 -0
- package/agents/protocols/TEMPLATE-EVOLUTION-PROTOCOL.md +544 -0
- package/agents/protocols/VISUAL-DIFF-PROTOCOL.md +159 -0
- package/agents/sua-empresa/README.md +44 -0
- package/agents/sua-empresa/_example/jds/EXAMPLE-JD.md +42 -0
- package/agents/sua-empresa/_example/org/EXAMPLE-ORG.md +32 -0
- package/agents/sua-empresa/_example/roles/EXAMPLE-ROLE.md +38 -0
- package/bin/cli.js +2 -0
- package/bin/lib/ascii-art.js +234 -0
- package/bin/lib/installer.js +402 -0
- package/bin/lib/setup-wizard.js +95 -0
- package/bin/lib/validate-email.js +109 -0
- package/bin/mega-brain.js +97 -0
- package/bin/push.js +342 -0
- package/bin/templates/env.example +38 -0
- package/inbox/.gitkeep +0 -0
- package/integrations/README.md +46 -0
- package/integrations/mcps/MCP-REGISTRY.md +56 -0
- package/integrations/mcps/excalidraw/CONFIG.md +56 -0
- package/integrations/mcps/gdrive/CONFIG.md +38 -0
- package/knowledge/dna/.gitkeep +0 -0
- package/knowledge/dossiers/persons/.gitkeep +0 -0
- package/knowledge/dossiers/persons/DOSSIER-EXAMPLE.md +49 -0
- package/knowledge/dossiers/system/.gitkeep +0 -0
- package/knowledge/dossiers/themes/.gitkeep +0 -0
- package/knowledge/playbooks/.gitkeep +0 -0
- package/knowledge/playbooks/PLAYBOOK-EXAMPLE.md +50 -0
- package/knowledge/sources/.gitkeep +0 -0
- package/logs/.gitkeep +0 -0
- package/package.json +128 -0
- package/processing/canonical/.gitkeep +0 -0
- package/processing/chunks/.gitkeep +0 -0
- package/processing/insights/.gitkeep +0 -0
- package/processing/narratives/.gitkeep +0 -0
- package/reference/CONSELHO.md +337 -0
- package/reference/CONTEXT7_README.md +28 -0
- package/reference/JARVIS-LOGGING-PROTOCOL.md +380 -0
- package/reference/QUICK-START.md +197 -0
- package/reference/README-RALPH-CASCATEAMENTO.md +207 -0
- package/reference/TEMPLATE-MASTER.md +727 -0
- package/reference/prds/prd-jarvis-mega-brain-v3.md +1305 -0
- package/reference/templates/phase5/IMPLEMENTATION-GUIDE.md +355 -0
- package/reference/templates/phase5/MOGA-BRAIN-PHASE5-TEMPLATES.md +1284 -0
- package/reference/templates/phase5/README.md +165 -0
- package/reference/workflow-claude-code-boris-cherny-continuous-claude.md +2232 -0
- package/system/database/001_moneyclub_buyers.sql +160 -0
- package/system/database/002_premium_token.sql +97 -0
- package/system/database/apply-migration.mjs +129 -0
- package/system/docs/MEGA-BRAIN-DEMO-COMPLETA.md +1226 -0
- package/system/docs/MEGA-BRAIN-MANIFESTO-COMPLETO.md +1054 -0
- package/system/docs/MOGA-BRAIN-EXPLICACAO-COMPLETA.md +791 -0
- package/system/docs/STRATEGIC-INTEGRATION-GUIDE.md +725 -0
- package/system/docs/architecture/01-system-context.md +136 -0
- package/system/docs/architecture/02-components.md +225 -0
- package/system/docs/architecture/03-data-flow.md +235 -0
- package/system/docs/architecture/04-integrations.md +283 -0
- package/system/docs/architecture/README.md +71 -0
- package/system/docs/architecture/diagrams/component-diagram.mmd +50 -0
- package/system/docs/architecture/diagrams/data-flow.mmd +39 -0
- package/system/docs/architecture/diagrams/system-overview.mmd +68 -0
- package/system/protocols/AGENT-AUTHORITY.md +217 -0
- package/system/protocols/CONSTITUICAO-BASE.md +115 -0
- package/system/protocols/CONSTITUTION.md +231 -0
- package/system/protocols/GOVERNANCE-MAP.md +123 -0
- package/system/protocols/HOOK-SECURITY-THREAT-MODEL.md +152 -0
- package/system/protocols/ORQUESTRACAO-PROTOCOL.md +215 -0
- package/system/protocols/_archive/CHUNKING-PROTOCOL.md +207 -0
- package/system/protocols/_archive/ENTITY-RESOLUTION-PROTOCOL.md +269 -0
- package/system/protocols/_archive/INSIGHT-EXTRACTION-PROTOCOL.md +257 -0
- package/system/protocols/_archive/NARRATIVE-SYNTHESIS-PROTOCOL.md +290 -0
- package/system/protocols/agents/AGENT-INTERACTION.md +315 -0
- package/system/protocols/agents/CORTEX-PROTOCOL.md +520 -0
- package/system/protocols/agents/EPISTEMIC-PROTOCOL.md +465 -0
- package/system/protocols/agents/MEMORY-PROTOCOL.md +366 -0
- package/system/protocols/agents/WAR-ROOM.md +355 -0
- package/system/protocols/company/COMPANY-DOCUMENT-PROTOCOL.md +793 -0
- package/system/protocols/company/COMPANY-ENRICHMENT-PROTOCOL.md +679 -0
- package/system/protocols/conclave/CONCLAVE-LOG-TEMPLATE-v2.md +309 -0
- package/system/protocols/conclave/CONCLAVE-PROTOCOL.md +518 -0
- package/system/protocols/conclave/DEBATE-DYNAMICS-CONFIG.yaml +322 -0
- package/system/protocols/conclave/DEBATE-DYNAMICS-PROTOCOL.md +613 -0
- package/system/protocols/conclave/DEBATE-PROTOCOL.md +323 -0
- package/system/protocols/council/COUNCIL-LOG-TEMPLATE-v2.md +309 -0
- package/system/protocols/council/COUNCIL-PROTOCOL.md +518 -0
- package/system/protocols/council/DEBATE-DYNAMICS-CONFIG.yaml +322 -0
- package/system/protocols/council/DEBATE-DYNAMICS-PROTOCOL.md +613 -0
- package/system/protocols/council/DEBATE-PROTOCOL.md +323 -0
- package/system/protocols/dna/DNA-EXTRACTION-PROTOCOL.md +1214 -0
- package/system/protocols/dna/ENRICHMENT-PROTOCOL.md +408 -0
- package/system/protocols/dna/REASONING-MODEL-PROTOCOL.md +331 -0
- package/system/protocols/pipeline/DOSSIER-COMPILATION-PROTOCOL.md +790 -0
- package/system/protocols/pipeline/NARRATIVE-METABOLISM-PROTOCOL.md +292 -0
- package/system/protocols/pipeline/PIPELINE-JARVIS-v2.1.md +606 -0
- package/system/protocols/pipeline/PROMPT-1.1-CHUNKING.md +154 -0
- package/system/protocols/pipeline/PROMPT-1.2-ENTITY-RESOLUTION.md +186 -0
- package/system/protocols/pipeline/PROMPT-2.1-DNA-TAGS-INCREMENT.md +208 -0
- package/system/protocols/pipeline/PROMPT-2.1-INSIGHT-EXTRACTION.md +191 -0
- package/system/protocols/pipeline/PROMPT-3.1-NARRATIVE-SYNTHESIS.md +331 -0
- package/system/protocols/pipeline/SOURCES-COMPILATION-PROTOCOL.md +340 -0
- package/system/protocols/system/AUTO-LOG-PROTOCOL.md +369 -0
- package/system/protocols/system/CHECKPOINT-ENFORCEMENT.md +176 -0
- package/system/protocols/system/ENFORCEMENT.md +435 -0
- package/system/protocols/system/LOG-TEMPLATES.md +1068 -0
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
FASE 2.5 - Script de Tagueamento de Arquivos INBOX
|
|
4
|
+
Mega Brain - Sistema de Inteligencia de Negocios
|
|
5
|
+
|
|
6
|
+
Este script:
|
|
7
|
+
1. Le a planilha de controle e extrai mapeamento nome -> TAG
|
|
8
|
+
2. Varre todos os arquivos do INBOX recursivamente
|
|
9
|
+
3. Faz matching entre arquivos e TAGs
|
|
10
|
+
4. Renomeia arquivos com prefixo [TAG]
|
|
11
|
+
5. Gera relatorio de arquivos processados vs orfaos
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
import json
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from datetime import datetime
|
|
19
|
+
|
|
20
|
+
# Configuration. All three paths are relative, so they resolve against the
# directory the script is started from.
INBOX_PATH = "inbox"
SCHEMA_PATH = ".claude/mission-control/SPREADSHEET-SCHEMA.json"
OUTPUT_PATH = ".claude/mission-control/TAG-MAPPING-REPORT.json"

# INBOX folder name -> TAG prefix.
# A value is a single prefix, a list of candidate prefixes, or None when the
# folder has no prefix yet.  Insertion order is kept as in the original table.
FOLDER_TO_PREFIX = {
    "JEREMY MINER": "JM",
    "JEREMY HAYNES": [
        "JH-ST",
        "JH-IC",
        "JH-WK",
        "AOBA",
        "PCVP",
        "LYFC",
        "MMM",
        "30DC",
        "STA",
        "UHTC",
    ],
    "THE SCALABLE COMPANY": "TSC",
    "ALEX HORMOZI": "AH",
    "JEREMY HAYNES PROGRAM": "CA",
    "SAM OVEN (SETTERLUN UNIVERSITY)": None,  # Not in the spreadsheet yet
    "SETTERLUN (SETTERLUN UNIVERSITY)": None,
}
|
|
35
|
+
|
|
36
|
+
def extract_number_from_filename(filename):
    """Return the integer that prefixes ``filename``, or ``None`` if absent.

    A leading run of digits only counts when it is followed by a ``-`` or
    ``.`` separator (optionally surrounded by whitespace) or by at least one
    whitespace character, for example::

        "6 - 42 Minutes of Sales Training.txt"  -> 6
        "112. How To Get Prospects.txt"         -> 112
        "44 - LIVE CALL A Masterclass.txt"      -> 44
    """
    # One alternation covers both accepted shapes: "123 - " / "123. " and "123 ".
    leading_number = re.match(r'^(\d+)(?:\s*[-\.]\s*|\s+)', filename)
    if leading_number is None:
        return None
    return int(leading_number.group(1))
|
|
54
|
+
|
|
55
|
+
def clean_filename_for_matching(filename):
    """Reduce a file name to a lowercase, punctuation-free string for matching.

    The extension, a trailing ``_`` + 14-digit timestamp and a leading
    ``"<number> -"`` / ``"<number>."`` prefix are dropped.  Every remaining
    character that is neither a word character nor whitespace becomes a
    space, and whitespace runs are collapsed to a single space.
    """
    stem, _extension = os.path.splitext(filename)

    # Both patterns are anchored (end / start of the stem), so they have to
    # run before punctuation is blanked out.
    without_timestamp = re.sub(r'_\d{14}$', '', stem)
    without_number = re.sub(r'^\d+\s*[-\.]\s*', '', without_timestamp)

    spaced = re.sub(r'[^\w\s]', ' ', without_number.lower())
    return re.sub(r'\s+', ' ', spaced).strip()
|
|
68
|
+
|
|
69
|
+
def determine_prefix_from_path(filepath, folder_to_prefix=None):
    """Return the TAG prefix configured for the folder that ``filepath`` is in.

    The path is upper-cased and searched for each configured folder name, so
    the mapping keys are expected to be upper-case.

    Args:
        filepath: Path (``str`` or path object) of the file being classified.
        folder_to_prefix: Optional folder-name -> prefix mapping.  Defaults to
            the module-level ``FOLDER_TO_PREFIX``.

    Returns:
        The mapped value (a prefix string, a list of prefixes or ``None``) of
        the longest folder name found in the path, or ``None`` when no folder
        name matches.
    """
    if folder_to_prefix is None:
        folder_to_prefix = FOLDER_TO_PREFIX

    path_str = str(filepath).upper()

    # Test the longest names first: "JEREMY HAYNES" is a substring of
    # "JEREMY HAYNES PROGRAM", so plain insertion order would always pick the
    # shorter key and the longer mapping could never be returned.  sorted() is
    # stable, so names of equal length keep their configured order.
    for folder in sorted(folder_to_prefix, key=len, reverse=True):
        if folder in path_str:
            return folder_to_prefix[folder]
    return None
|
|
77
|
+
|
|
78
|
+
def scan_inbox_files():
    """Walk ``INBOX_PATH`` recursively and describe every candidate file.

    Only ``.txt``, ``.docx`` and ``.pdf`` files are collected (the extension
    is compared case-insensitively).  Files in a directory whose path contains
    ``_BACKUP`` or ``_TEMPLATE`` are ignored.

    Returns:
        A list of dicts with the keys ``path``, ``filename``, ``folder``,
        ``parent_folder``, ``number``, ``clean_name``, ``has_tag`` and
        ``suggested_prefix``.
    """
    wanted_suffixes = ('.txt', '.docx', '.pdf')
    # A name that already starts with "[SOMETHING]" counts as tagged.
    tag_pattern = re.compile(r'^\[[\w-]+\]')
    records = []

    for current_dir, _subdirs, names in os.walk(INBOX_PATH):
        # Skip backup/template folders
        if any(marker in current_dir for marker in ('_BACKUP', '_TEMPLATE')):
            continue

        for name in names:
            if os.path.splitext(name)[1].lower() not in wanted_suffixes:
                continue

            full_path = Path(current_dir) / name
            already_tagged = tag_pattern.match(name) is not None

            records.append({
                'path': str(full_path),
                'filename': name,
                'folder': os.path.basename(current_dir),
                'parent_folder': os.path.basename(os.path.dirname(current_dir)),
                'number': extract_number_from_filename(name),
                'clean_name': clean_filename_for_matching(name),
                'has_tag': already_tagged,
                'suggested_prefix': determine_prefix_from_path(full_path)
            })

    return records
|
|
108
|
+
|
|
109
|
+
def generate_tag_from_number(prefix, number):
    """Build a tag of the form ``PREFIX-NNNN``.

    ``number`` is zero-padded to at least four digits.  When ``prefix`` is a
    list, its first element is used as the default prefix.
    """
    chosen_prefix = prefix[0] if isinstance(prefix, list) else prefix
    return "{}-{:04d}".format(chosen_prefix, number)
|
|
114
|
+
|
|
115
|
+
def rename_file_with_tag(filepath, tag):
    """Rename ``filepath`` in place so that its name starts with ``[tag] ``.

    Returns:
        ``(new_path, None)`` on success, or ``(None, message)`` when the
        target name already exists or the rename raised an exception.
    """
    source = Path(filepath)
    target = source.parent / f"[{tag}] {source.name}"

    # An existing target is reported as an error instead of being replaced.
    if target.exists():
        return None, "Destino ja existe"

    try:
        source.rename(target)
    except Exception as exc:
        return None, str(exc)
    return str(target), None
|
|
130
|
+
|
|
131
|
+
def execute_rename(report, dry_run=False):
    """Rename (or, with ``dry_run``, only list) every file in ``report['to_tag']``.

    Each entry supplies ``current_path``, ``suggested_tag`` and ``filename``.
    In dry-run mode nothing is touched and every entry counts as a success.

    Returns:
        ``(success_count, errors)`` where ``errors`` is a list of
        ``{'path': ..., 'error': ...}`` dicts.
    """
    banner = "=" * 60
    print()
    print(banner)
    print("EXECUTANDO RENOMEACAO" + (" (DRY RUN)" if dry_run else ""))
    print(banner)
    print()

    pending = report['to_tag']
    renamed = 0
    failures = []

    for entry in pending:
        source_path = entry['current_path']
        tag = entry['suggested_tag']

        if dry_run:
            current_name = Path(source_path).name
            print(f" [DRY] {current_name}")
            print(f" -> [{tag}] {current_name}")
            renamed += 1
            continue

        _new_path, problem = rename_file_with_tag(source_path, tag)
        if problem:
            failures.append({'path': source_path, 'error': problem})
            print(f" [ERRO] {entry['filename']}: {problem}")
            continue

        renamed += 1
        # Progress line every 50 successful renames.
        if renamed % 50 == 0:
            print(f" Renomeados: {renamed}/{len(pending)}")

    print()
    print(banner)
    print("RESULTADO")
    print(banner)
    print(f" Sucesso: {renamed}")
    print(f" Erros: {len(failures)}")

    if failures:
        print()
        print("ERROS:")
        for failure in failures[:10]:  # Show only the first 10
            print(f" - {failure['path']}: {failure['error']}")
        hidden = len(failures) - 10
        if hidden > 0:
            print(f" ... e mais {hidden} erros")

    return renamed, failures
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def main(execute=False, dry_run=False):
    """Classify the INBOX files, write the tag report and optionally rename.

    Steps: load the spreadsheet schema, scan the INBOX, split the files into
    already tagged / to tag / orphans, and dump the report as JSON to
    ``OUTPUT_PATH``.  When a rename pass runs, its outcome is stored in the
    report under ``'execution'`` and the file is written a second time.

    Args:
        execute: Run the rename pass after the report has been written.
        dry_run: Only simulate the rename pass.  Passing it alone is enough to
            trigger the simulation, as the hint printed after a report-only
            run promises ("Ou --dry-run para simular").

    Returns:
        The report dictionary.
    """
    separator = "=" * 60
    print(separator)
    print("FASE 2.5 - TAGUEAMENTO DE ARQUIVOS INBOX")
    print(separator)
    print()

    # 1. Load the schema.  The parsed content is not used below; the load is
    # kept so a missing or malformed schema still stops the run up front.
    print("[1/4] Carregando schema...")
    with open(SCHEMA_PATH, 'r', encoding='utf-8') as f:
        json.load(f)

    # 2. Scan the files
    print("[2/4] Escaneando INBOX...")
    files = scan_inbox_files()
    print(f" Encontrados: {len(files)} arquivos")

    # 3. Classify the files
    print("[3/4] Classificando arquivos...")

    def is_taggable(info):
        # Compare against None explicitly: 0 is a valid leading number and
        # must not be mistaken for "no number in the name".
        return bool(info['suggested_prefix']) and info['number'] is not None

    already_tagged = [f for f in files if f['has_tag']]
    untagged = [f for f in files if not f['has_tag']]
    to_tag = [f for f in untagged if is_taggable(f)]
    orphans = [f for f in untagged if not is_taggable(f)]

    print(f" Ja tagueados: {len(already_tagged)}")
    print(f" Para taguear: {len(to_tag)}")
    print(f" Orfaos: {len(orphans)}")

    # 4. Build the report
    print("[4/4] Gerando relatorio...")

    report = {
        'timestamp': datetime.now().isoformat(),
        'summary': {
            'total_files': len(files),
            'already_tagged': len(already_tagged),
            'to_tag': len(to_tag),
            'orphans': len(orphans)
        },
        'to_tag': [],
        'orphans': []
    }

    for f in to_tag:
        # generate_tag_from_number() picks the first prefix of a list itself.
        report['to_tag'].append({
            'current_path': f['path'],
            'suggested_tag': generate_tag_from_number(f['suggested_prefix'], f['number']),
            'filename': f['filename'],
            'folder': f['folder']
        })

    for f in orphans:
        report['orphans'].append({
            'path': f['path'],
            'filename': f['filename'],
            'folder': f['folder'],
            'reason': 'Sem prefixo conhecido' if not f['suggested_prefix'] else 'Sem numero no nome'
        })

    # Save the report
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)

    print()
    print(separator)
    print("RELATORIO GERADO")
    print(separator)
    print(f"Arquivo: {OUTPUT_PATH}")

    # 5. Run the rename pass when requested (for real or as a simulation)
    if execute or dry_run:
        success, errors = execute_rename(report, dry_run)
        report['execution'] = {
            'success': success,
            'errors': len(errors),
            'error_details': errors
        }
        # Rewrite the report so it includes the outcome
        with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)
    else:
        print()
        print("PROXIMO PASSO: Executar com --execute para renomear")
        print(" Ou --dry-run para simular")

    print()
    return report
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
if __name__ == '__main__':
    import sys

    # Each flag is detected by its mere presence anywhere on the command line.
    cli_args = sys.argv
    main(execute='--execute' in cli_args, dry_run='--dry-run' in cli_args)
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
FASE 2.5 v2 - Tagueamento por MATCHING DE NOMES
|
|
4
|
+
Mega Brain - Sistema de Inteligência de Negócios
|
|
5
|
+
|
|
6
|
+
ABORDAGEM CORRETA:
|
|
7
|
+
1. Extrair de TODAS as abas: nome_video → TAG
|
|
8
|
+
2. Para cada arquivo INBOX, fazer matching por nome similar
|
|
9
|
+
3. Se match encontrado, usar TAG da planilha
|
|
10
|
+
4. Se não, manter órfão
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
import re
|
|
15
|
+
import json
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from datetime import datetime
|
|
18
|
+
from difflib import SequenceMatcher
|
|
19
|
+
|
|
20
|
+
# Configuration. All paths are relative, so they resolve against the
# directory the script is started from.
INBOX_PATH = "inbox"
SCHEMA_PATH = ".claude/mission-control/SPREADSHEET-SCHEMA.json"
INDEX_PATH = ".claude/mission-control/PLANILHA-INDEX.json"
OUTPUT_PATH = ".claude/mission-control/TAG-MAPPING-V2.json"
|
|
25
|
+
|
|
26
|
+
def normalize_name(name):
    """Normalize a file or video name so that two names can be compared.

    The extension is dropped and the text lower-cased, then a fixed series of
    substitutions is applied in order.  Surrounding whitespace is stripped
    from the result.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    cleanup_steps = (
        (r'_\d{14}$', ''),                  # trailing 14-digit timestamp
        (r'\d{1,2}-\d{1,2}-\d{2,4}', ''),   # dates such as 12-25-24
        (r'^\d+[\s\.\-]+', ''),             # leading number
        (r'\[youtube\.com[^\]]*\]', ''),    # [youtube.com...] markers
        (r'\[[^\]]*\]', ''),                # any other bracketed text
        (r'\s*\(\d+\)\s*', ''),             # (1), (2), etc
        (r'[^\w\s]', ' '),                  # special characters
        (r'\s+', ' '),                      # collapse whitespace runs
    )

    result = os.path.splitext(name)[0].lower()
    for pattern, replacement in cleanup_steps:
        result = re.sub(pattern, replacement, result)
    return result.strip()
|
|
46
|
+
|
|
47
|
+
def similar(a, b):
    """Return the ``SequenceMatcher`` similarity ratio of ``a`` and ``b`` (0-1)."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()
|
|
50
|
+
|
|
51
|
+
def load_planilha_index():
    """Load the spreadsheet index stored at ``INDEX_PATH``.

    Returns:
        The parsed JSON content, or ``None`` when the index file does not
        exist.
    """
    if not os.path.exists(INDEX_PATH):
        return None

    with open(INDEX_PATH, 'r', encoding='utf-8') as index_file:
        return json.load(index_file)
|
|
57
|
+
|
|
58
|
+
def scan_inbox_files():
    """Walk ``INBOX_PATH`` recursively and describe every candidate file.

    Only ``.txt``, ``.docx`` and ``.pdf`` files are collected (the extension
    is compared case-insensitively).  Files in a directory whose path contains
    ``_BACKUP`` or ``_TEMPLATE`` are ignored.

    Returns:
        A list of dicts with the keys ``path``, ``filename``, ``folder``,
        ``parent_folder``, ``normalized`` and ``has_tag``.
    """
    wanted_suffixes = ('.txt', '.docx', '.pdf')
    # A name that already starts with "[SOMETHING]" counts as tagged.
    tag_pattern = re.compile(r'^\[[\w-]+\]')
    records = []

    for current_dir, _subdirs, names in os.walk(INBOX_PATH):
        if any(marker in current_dir for marker in ('_BACKUP', '_TEMPLATE')):
            continue

        for name in names:
            if os.path.splitext(name)[1].lower() not in wanted_suffixes:
                continue

            full_path = Path(current_dir) / name
            already_tagged = tag_pattern.match(name) is not None

            records.append({
                'path': str(full_path),
                'filename': name,
                'folder': os.path.basename(current_dir),
                'parent_folder': os.path.basename(os.path.dirname(current_dir)),
                'normalized': normalize_name(name),
                'has_tag': already_tagged
            })

    return records
|
|
85
|
+
|
|
86
|
+
def match_file_to_index(file_info, index, threshold=0.7):
    """Find the index entry whose normalized name is most similar to the file's.

    Args:
        file_info: Dict holding the file's ``'normalized'`` name.
        index: Dict whose ``'entries'`` are dicts with a ``'normalized'`` name.
        threshold: Minimum similarity an entry needs to be accepted.

    Returns:
        ``(entry, score)`` for the best accepted entry; on equal scores the
        earliest entry wins.  ``(None, 0)`` when the file's normalized name is
        empty or no entry is accepted.
    """
    target = file_info['normalized']
    if not target:
        return None, 0

    winner, top_score = None, 0
    for candidate in index['entries']:
        # Compare against the spreadsheet's normalized name
        candidate_score = similar(target, candidate['normalized'])
        if candidate_score < threshold or candidate_score <= top_score:
            continue
        winner, top_score = candidate, candidate_score

    return winner, top_score
|
|
104
|
+
|
|
105
|
+
def main(execute=False, threshold=0.7):
    """Tag INBOX files by fuzzy-matching their names against the index.

    Steps: load the spreadsheet index, scan the INBOX, match every file that
    is not already tagged, and write a JSON report to OUTPUT_PATH. When
    ``execute`` is true, each matched file is additionally renamed to
    ``[TAG] <current file name>`` in place and the outcome is added to the
    report under ``'execution'``.

    Args:
        execute: Rename matched files when true; otherwise only report.
        threshold: Minimum similarity score forwarded to
            match_file_to_index.

    Returns:
        The report dict, or None when the index could not be loaded.
    """
    def save_report():
        # Written once after matching and again after renaming, so the
        # file on disk always reflects the latest state of ``report``.
        with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
            json.dump(report, out, ensure_ascii=False, indent=2)

    print("=" * 60)
    print("FASE 2.5 v2 - TAGUEAMENTO POR MATCHING")
    print("=" * 60)
    print()

    # 1. Load the spreadsheet index
    print("[1/4] Carregando índice da planilha...")
    index = load_planilha_index()
    if not index:
        print(" ERRO: Índice não encontrado!")
        print(" Execute primeiro: /criar-indice-planilha")
        return None
    print(f" {len(index['entries'])} entradas no índice")

    # 2. Scan the INBOX
    print("[2/4] Escaneando INBOX...")
    files = scan_inbox_files()
    print(f" {len(files)} arquivos encontrados")

    # 3. Match each untagged file against the index
    print(f"[3/4] Matching (threshold={threshold})...")

    matched = []
    orphans = []
    already_tagged = []

    for f in files:
        if f['has_tag']:
            already_tagged.append(f)
            continue

        match, score = match_file_to_index(f, index, threshold)
        if match:
            matched.append({'file': f, 'match': match, 'score': score})
        else:
            orphans.append(f)

    print(f" Já tagueados: {len(already_tagged)}")
    print(f" Match encontrado: {len(matched)}")
    print(f" Órfãos: {len(orphans)}")

    # 4. Build and save the report
    print("[4/4] Gerando relatório...")

    report = {
        'timestamp': datetime.now().isoformat(),
        'threshold': threshold,
        'summary': {
            'total_files': len(files),
            'already_tagged': len(already_tagged),
            'matched': len(matched),
            'orphans': len(orphans)
        },
        'matches': [
            {
                'current_path': m['file']['path'],
                'filename': m['file']['filename'],
                'folder': m['file']['folder'],
                'matched_to': m['match']['original_name'],
                'tag': m['match']['tag'],
                'sheet': m['match']['sheet'],
                'score': round(m['score'], 3)
            }
            for m in matched
        ],
        'orphans': [
            {
                'path': o['path'],
                'filename': o['filename'],
                'folder': o['folder'],
                'normalized': o['normalized']
            }
            for o in orphans
        ]
    }

    save_report()

    print()
    print("=" * 60)
    print("RELATÓRIO GERADO")
    print("=" * 60)
    print(f"Arquivo: {OUTPUT_PATH}")

    # 5. Rename the matched files when requested
    if execute and matched:
        print()
        print("=" * 60)
        print("EXECUTANDO RENOMEAÇÃO")
        print("=" * 60)

        success = 0
        errors = []

        for m in matched:
            filepath = Path(m['file']['path'])
            tag = m['match']['tag']
            new_name = f"[{tag}] {filepath.name}"
            new_path = filepath.parent / new_name

            # Never overwrite an existing file; record it and move on.
            if new_path.exists():
                errors.append({'path': str(filepath), 'error': 'Destino já existe'})
                continue

            # Only the rename itself can fail here: filesystem errors
            # (OSError) or an invalid path (ValueError). One failure is
            # recorded and the batch keeps going.
            try:
                filepath.rename(new_path)
            except (OSError, ValueError) as e:
                errors.append({'path': str(filepath), 'error': str(e)})
            else:
                success += 1
                if success % 50 == 0:
                    print(f" Renomeados: {success}/{len(matched)}")

        print()
        print(f"Sucesso: {success}")
        print(f"Erros: {len(errors)}")

        report['execution'] = {
            'success': success,
            'errors': len(errors),
            'error_details': errors
        }

        save_report()

    elif execute:
        # --execute was passed but nothing matched: say so, instead of
        # telling the user to pass a flag they already passed.
        print()
        print("Nenhum arquivo com match para renomear")
        print(f" Threshold atual: {threshold}")
        print(" Use --threshold=0.6 para matching mais flexível")

    else:
        print()
        print("PRÓXIMO PASSO: Execute com --execute para renomear")
        print(f" Threshold atual: {threshold}")
        print(" Use --threshold=0.6 para matching mais flexível")

    return report
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
# Command-line entry point: "--execute" turns on renaming and
# "--threshold=<float>" overrides the default matching threshold of 0.7.
if __name__ == '__main__':
    import sys

    cli_args = sys.argv
    chosen_threshold = 0.7
    for cli_arg in cli_args:
        if not cli_arg.startswith('--threshold='):
            continue
        # When the flag is repeated, the last occurrence wins.
        chosen_threshold = float(cli_arg.split('=')[1])

    main(execute='--execute' in cli_args, threshold=chosen_threshold)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Test extraction of text from base64-encoded .docx
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import base64
|
|
7
|
+
import subprocess
|
|
8
|
+
import tempfile
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
# Base64 content from Google Drive (truncated for test - will use full content)
|
|
12
|
+
base64_content = """UEsDBBQACAgIACc2JVwAAAAAAAAAAAAAAAASAAAAd29yZC9udW1iZXJpbmcueG1s7VrLjtowFP2C/gOK1OWQJ4FBA7PoaKpWo6pqpx9gEkMs/IhsB2a+oYvu2m2/rV9SJyHhTRMaBEheBe71PfYxh3t0pdzdvxDcmkEuEKMDw25bRgvSgIWITgbGt+fHm57REhLQEGBG4cB4hcK4H765m/dpQkaQq3UtBUFFnwQDI5Iy7pumCCJIgGizGFKVHDNOgFRf+cQkgE+T+CZgJAYSjRBG8tV0LMs3FjBsYCSc9hcQNwQFnAk2lmlJn43HKICLR1HBq+yblzywICGQymxHk0OszsCoiFAsCjRyLJpKRgXI7BCJGcHFunlcZbeQg7m6Z4LzjeaMhzFnARRCRR/yZIloWxUuMIUoK6ocYX3P4iQEIFrCpOrYACr3bqu9F5eWQS2JLO9C4CoHyVNPaMQBf90+BTjiPlfrY1RJxRsIqkomvBTkMRBBBLgsAPAxCJgFUxi+A3QGSjGHk0py3kAKEZhwQJYiFbV+WdvakMvXCMRwiTb5P7T3nCXxUu7eMWgr/0C7Uw/A2QLw6wH0tgEC+HIchqkqV3FQWA/HL3FQ0Q+qAUggpiJFuDXDRRfMQuVfYLsp78RRH9PaFMqxVEMGQdaQjaEyGDASkqvAp4S01r59UCztbAmeYZVC6jEwrCyivIpLFZsBnC4yh7lTPZIyGMIAEYDzlKp8hi9l7q3dLuMfgyKK4Vjm4fgzTx+IhiqXhgdG11GWOe9HgE4yz3R9K11rlot5/kgKNKp8dLEiS5nZbpts7IpsMJtD/gSlhHw3I6c2I9vzTkLJqUPpCyOA7mbk1mbk2P5JGLlNSM6rT6fXOwkdrynNdWpTUgxOQqnTkOb82ow89zSNwW9Cc93adDrWaZpCtynN9epT6p6mK/Qa0txtbUa+11RjMNcM959u7BztxqMEYyh3XsCfXz+uzIwPkvl+bT58iM3P39fmwc0K7dwW3KzSzu2+zSrt3M7brNLO7bzNKu3cxtus0s7tuc0q7dy+23rq1dPvtTivnn4v3X/19HvhJqyn30t2Y09Pv3r6vWwP1tPv5bqvnn4v13j19HuBftvRfqv9Vvut9lvtt9pvtd824bc081m6+m7Vmumu0TSzlVtlzv4y50CZu7/MPVDm7S/zDpR19pd1VsvMlRecg39BLRwiffbMDUQIVcjZ93vxiwcNT9w4T8y1SycPygam3DNJkExw1DDBjTPBadmcct6QoAtFA4PkhWcNL9w4L7RJRGPGNfXvCG7s5I8OevMgU4ZHM3NCUteDPUTGeLrRMMbNb0LX2wi6iGjKZQJGg+WuxI9DbZX5csgyRUGfIPli44ab54spAPfBVUi9IPlg65b44II38s4u+SIIg3SBQ28uID6DXPfgY/V3wmX/9AaD09fxm6+Wy/7o9Oxk/2zyT193909JMDntnh4dvdgP0m3/tAl/V0LmR0u9jjZvxCwJLjpyCyL3mlqggSRdCX90ntkFJvic7Nghx3MOW+6eT6VgToNdHWsX8sEy/VeWowiRkx9/Rk5eEWvrFvj7pCgz8JuyFkFA/H2CC5LmguoxNh4wOflRA9cbwlIfp3C8nhbBzH4uYxokg+82DP4ZGPx59/iQPHjOB4nT4dK2yLEakX1raZSgO+qLgDj+BUU17pS59KlwVFgKvI263NCMDRRwvYpC7bbydDmwRsPjv4zHiz7J5ExJFhA3vzFUu7pXaIdopiI8K9oIgRLPJx8k9S5A2tO+lYAHooTI18shUBq+/mV8/VTFYnIMP3pOBwyrlSKcNyAG78bw/nCV2zQxXKIFgqXdHg6Z7uEX4HvfmdhxfYh8vRyapuHrX+jH8W0KLrY7dY1OA2LvQzA1pLEauHu+sNylNnEhMvZyIKCGsX8pYyuHGhPkVcZkWMr6GX3P5qwM7zGBR+XvqQauRleJNO7jwlbSoWJD5PDl
IE0Nh/8yDp9DQnYl7CRLJOTUgRIQx59dBegESwU0uekXljjtGSVyS0Pk8OXwWj/P4asZar01Zn5GuRZjeNtHoOoKbj5jUSI5ME1A7PyCS56itvabxkhz+Eh7TPs+tDjG5TxzhsjTwVeduRNhsYMbrHByHdlbvTDym7M3J/shBo4Pgq+acjckZAdLfNw9CdkmJ0XW3OtZaf8QV3F7KRNp+26G/z89uD+NKr6eC46TB9j0lqakj3XeiNKkL/BTiLC0g/ZnZIAVMZw/vc40ejB87ckj7NhByYOywn6LsHcRtyxuYekXDn8kXKPyILGqBzu3xByruYFaIkMeNshH0zYtpXNtWk4YFeQBzZgE5YHgGiroUNMw2aDp3VIJme/ocev09CzqV1oN4IspyvseoOlQaJBG1XJAnXZjVPmyrXLyIeKqQ2bWlS6BXGEisx6WQ63czILf1ZfnC4Y8UACaEIQV4sIvB+toFv6je+fVdgH9KdL5inZVDEzlCN8AWC4gRGZYDgvRMMPHmOEZwxoBmPTbIcdsipIJHhhzsBy+4eeZoSZbp2dKCDV6mGcdciRjptnkTxRxraAPLHsPJgFPWcypDTa8fhB8T5C7sWm6wWYXd2/TdLh/+OqYPOu+3H9xcNS91E0pkCVcriDHTrNdmjmZGGKSCt8SvB25ZuFqxsPlKmTczHrf3fdlhJkdBC5RujPtLOg7RUcqzTQi04rO0aoPL9SI+5OD7S54uFzFjIZPPsYnB7nxLa5UprQt6iWU/PLaqp5gmCyUMZ1S3G4ByxhLbY7VPpGRpOUyp+HyzHLVNX6eZ2pidnflEGF+xYbrYq0N+dG2aiGygFN0oxIyB98R4jZs6zcn+5N/nPxDSA2F/+4xSUE/g0JapbbMz16dvNgPqjHzt5MPMQNbKs4j6nNl8ZWJRYxBG2J94/mGLa0L5QoL87sAmU7+mDLt9CYxWOrQ+a5Mjm/fII3zhZB/5WpvkRSGRdafnw1OseNDAkz2aHe7jfOP4PPmk40n+FlpDuSHJQSqaqqtZ8RscIPiE/eUtSqF09te81iFz7H4V3L5TaNNZ/Y1YfCi09M3dl8pW34t7vAyT8+wsSA8LHbXmHkJymdfy/vHY/chVlGOr9av/h+AUEsHCElAgyk3IAAAeckBAFBLAwQUAAgICAAnNiVcAAAAAAAAAAAAAAAAEQAAAHdvcmQvc2V0dGluZ3MueG1spZZLktowEIZPkDtQ2oMfA2RCjZlFpiZZZFaQA8iSbKvQqyQZD7ePZFs2j1TKkBXS391fy0275ZfXT85mR6INlSIDySIGMyKQxFSUGfi9f58/g5mxUGDIpCAZOBEDXrdfXpqNIdY6LzNzBGE2HGWgslZtosiginBoFlIR4YyF1Bxat9VlxKE+1GqOJFfQ0pwyak9RGsdr0GNkBmotNj1izinS0sjC+pCNLAqKSP8TIvSUvF3Im0Q1J8K2GSNNmDuDFKaiygQaf5TmjFWAHP/1EEfOgl+jpmTDGjau0Jx1iRqpsdISEWOc+tYZB2ISTyigRwwRU45wmTOchEMqBoxvjivQkHvhcvdFa1Hjg4y1MGzKQTrTL5prqE+3p4AP1PM8XtFJXXxFcFG21kNDPoJAFdQ2ANgjBCbRgeDvUBzh0My4nNTOVyRMYakhH5vU3PXPJvFVu+wqqMhIK/+P9kPLWo3tvnyEdvYGJqv7AOkNYH0f4PkWgMjnY4zIRZ5zKL6Psx44NMyDaQALzcF4wrcI91OwlYZX4HYo/5Xjlj7Wo9LYDWSI2oEMtu5+ITwneK9rsj8p8i6d16zZHKEbEwmIvAMmBayZ3cN8Z6UKxq9p3Jm7O2Zc7br7aoCsgFsKyN3curiOPiQm3lRrOr2aPmV0kZPpnQ8iH1CpLm1eJhlgtKxs4vnW7bC7DttNXqa9LW1taWdrNxAhVyPn3S9GLQ3amd9T0J5GbRm05aitgrYatXXQ1l6rXN01o+LgyhCWXi8kY7Ih+Odov5H6eoRv
hO0fUEsHCLVCXjdPAgAAaAgAAFBLAwQUAAgICAAnNiVcAAAAAAAAAAAAAAAAEgAAAHdvcmQvZm9udFRhYmxlLnhtbKWVS27bMBCGT9A7CNzblI3USAXLQdEg3XTX9gATkpII84UhZcW3L2Xr4dpBICsrSRz93wzJn8Pt05tWyUGgl9bkZLVMSSIMs1yaMid//7wsHkniAxgOyhqRk6Pw5Gn3ZdtkhTXBJ1FufKZZTqoQXEapZ5XQ4JfWCRODhUUNIX5iSTXgvnYLZrWDIF+lkuFI12m6IR3G5qRGk3WIhZYMrbdFaCWZLQrJRPfoFTgl71nybFmthQmnjBSFijVY4yvpfE/Tc2kxWPWQw0eTOGjV/9e4Kdk4QhP3QqtzosYid2iZ8D6OPp+DA3GVTljAFjEoppTwf86+Eg3SDJjWGVegIfcy5u4W7YQaJzKuhVdTCjmHfslXBDzeVgEz1vNS7+QkF18RoirUOBhyDoJVgKEHqDkEZdle8B9gDjCYmZeT7HxF4hJKBD2a1N+1s6v0yi6/K3BipJWfo/1EW7vR7g9zaBcncPX1PsD6BrC5D/B4C2DibR6DRuUlR/L7OJuBI/t+MA0QwO99S/hGedcFT0PDEbhtyu9y4murbVHrNDZkYKeGTHbd5ZI0mQEdO8t3lKAI3W1pd+vs/gFQSwcIoGOcdbcBAAC3BgAAUEsDBBQACAgIACc2JVwAAAAAAAAAAAAAAAAPAAAAd29yZC9zdHlsZXMueG1s"""
|
|
13
|
+
|
|
14
|
+
# Decode the embedded base64 payload, write it to disk and report whether the
# result is a readable ZIP/DOCX container. Any failure is printed, not raised.
try:
    docx_bytes = base64.b64decode(base64_content)
    print(f"Decoded {len(docx_bytes)} bytes")

    # Save under the platform temp directory (usually /tmp on Linux unless
    # TMPDIR is set) instead of a hard-coded POSIX-only path, so the script
    # also runs where /tmp does not exist.
    docx_path = os.path.join(tempfile.gettempdir(), 'test_doc.docx')
    with open(docx_path, 'wb') as f:
        f.write(docx_bytes)

    print(f"Saved to {docx_path}")

    # Check if it's a valid zip/docx
    import zipfile
    if zipfile.is_zipfile(docx_path):
        print("Valid ZIP/DOCX structure detected")
        with zipfile.ZipFile(docx_path, 'r') as z:
            # Only the first five member names are shown.
            print("Contents:", z.namelist()[:5])
    else:
        print("Not a valid ZIP file - content may be truncated")

except Exception as e:
    print(f"Error: {e}")
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Test script to verify extraction works with known base64 content
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import base64
|
|
7
|
+
import tempfile
|
|
8
|
+
import zipfile
|
|
9
|
+
import re
|
|
10
|
+
from xml.etree import ElementTree as ET
|
|
11
|
+
|
|
12
|
+
# Clark-notation prefix ("{uri}") that ElementTree puts in front of every
# WordprocessingML tag name.
WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'


def extract_text_from_xml(xml_content: str) -> str:
    """Extract the text of a WordprocessingML ``document.xml`` string.

    Well-formed XML is walked element by element: ``w:t`` contributes its
    text, ``w:p`` starts a new line (unless one was just started) and
    ``w:br`` inserts a line break. Runs of spaces/tabs are then collapsed
    to one space and blank-line runs to a single empty line.

    If the XML cannot be parsed, a regex fallback collects the content of
    every ``<w:t>`` element, decodes XML entities and joins the pieces with
    single spaces.

    Args:
        xml_content: The XML text to extract from.

    Returns:
        The extracted text; an empty string when nothing is found.
    """
    # Local import so the block does not depend on the file's import header.
    from html import unescape

    try:
        root = ET.fromstring(xml_content)
    except ET.ParseError:
        # Match only <w:t> / <w:t attr="..."> - not other elements whose
        # name merely starts with "w:t" (w:tbl, w:tab, w:tc, ...).
        texts = re.findall(r'<w:t(?:\s[^>]*)?>([^<]*)</w:t>', xml_content)
        # The raw markup still carries entities such as &amp; - decode them
        # so both code paths return plain text.
        return ' '.join(unescape(piece) for piece in texts)

    text_tag = f'{WORD_NAMESPACE}t'
    paragraph_tag = f'{WORD_NAMESPACE}p'
    break_tag = f'{WORD_NAMESPACE}br'

    texts = []
    for elem in root.iter():
        if elem.tag == text_tag:
            if elem.text:
                texts.append(elem.text)
        elif elem.tag == paragraph_tag:
            # A paragraph starts a new line, but never a leading or
            # doubled one.
            if texts and not texts[-1].endswith('\n'):
                texts.append('\n')
        elif elem.tag == break_tag:
            texts.append('\n')

    text = ''.join(texts)
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n', '\n\n', text)
    return text.strip()
|
|
36
|
+
|
|
37
|
+
# Read the base64 payload from TEST_FILE, decode it, check that it is a ZIP
# container and print a preview of the text found in word/document.xml.
TEST_FILE = ".claude/temp/full_base64.txt"

try:
    with open(TEST_FILE, 'r') as f:
        base64_content = f.read().strip()

    print(f"Read {len(base64_content)} chars of base64")

    docx_bytes = base64.b64decode(base64_content)
    print(f"Decoded to {len(docx_bytes)} bytes")

    # Stage the bytes in a temp file that is deleted when the block exits
    # (the previous delete=False file was never removed). The open file
    # object is handed to zipfile directly, so it is never reopened by name.
    with tempfile.NamedTemporaryFile(suffix='.docx') as tmp:
        tmp.write(docx_bytes)
        tmp.flush()

        if zipfile.is_zipfile(tmp):
            print("Valid ZIP/DOCX structure")
            with zipfile.ZipFile(tmp, 'r') as z:
                files = z.namelist()
                print(f"Contains {len(files)} files")
                if 'word/document.xml' in files:
                    xml_content = z.read('word/document.xml').decode('utf-8')
                    print(f"document.xml: {len(xml_content)} chars")
                    text = extract_text_from_xml(xml_content)
                    print(f"\nExtracted text ({len(text)} chars, {len(text.split())} words):")
                    print("="*60)
                    # Preview is capped at the first 2000 characters.
                    print(text[:2000] if len(text) > 2000 else text)
                    print("="*60)
        else:
            print("Not a valid ZIP file")

except FileNotFoundError:
    print(f"Test file not found: {TEST_FILE}")
    print("Please save base64 content to this file first")
except Exception as e:
    print(f"Error: {e}")
|