edusquads-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/edusquads/SKILL.md +212 -0
- package/CLAUDE.md +40 -0
- package/README.md +92 -0
- package/_edusquads/evidencias/EVIDENCIA-MODELO.md +33 -0
- package/_edusquads/memoria/USUARIO-ATIVO.md +42 -0
- package/_edusquads/runs/RUN-MODELO.md +50 -0
- package/_edusquads/runs/RUNS-INDEX.md +25 -0
- package/base/catalogo-dos-160-agentes.md +175 -0
- package/base/comandos/edusquads-comandos.md +46 -0
- package/base/matriz-mestre-dos-16-squads.md +1147 -0
- package/base/matriz-mestre-dos-especialistas.md +579 -0
- package/base/modelos/modelo-de-agente.md +62 -0
- package/base/modelos/modelo-de-skill.md +55 -0
- package/base/paridade-opensquad.md +49 -0
- package/base/playbooks/investigacao/COMO-EXECUTAR.md +51 -0
- package/base/playbooks/investigacao/ESTRUTURACAO.md +28 -0
- package/base/playbooks/investigacao/PLAYBOOK-GERAL.md +34 -0
- package/base/playbooks/investigacao/instagram.md +23 -0
- package/base/playbooks/investigacao/linkedin.md +23 -0
- package/base/playbooks/investigacao/x-twitter.md +23 -0
- package/base/playbooks/investigacao/youtube.md +23 -0
- package/base/protocolo-memoria-usuario.md +50 -0
- package/base/protocolo-playwright-edusquads.md +48 -0
- package/base/scripts/edusquads_concluir_investigacao.py +356 -0
- package/base/scripts/edusquads_estruturar_coleta.py +237 -0
- package/base/scripts/edusquads_investigar.py +279 -0
- package/base/visao-geral.md +46 -0
- package/bin/edusquads.js +146 -0
- package/carrosseis.md +988 -0
- package/especialistas/branding/marty-neumeier.md +39 -0
- package/especialistas/copy/joanna-wiebe.md +41 -0
- package/especialistas/copy/russell-brunson.md +41 -0
- package/especialistas/mensagem/donald-miller.md +41 -0
- package/especialistas/posicionamento/april-dunford.md +39 -0
- package/especialistas/trafego-pago/pedro-sobral.md +41 -0
- package/package.json +31 -0
- package/pesquisa/web/april-dunford.md +37 -0
- package/pesquisa/web/claude-code-comandos.md +30 -0
- package/pesquisa/web/donald-miller.md +29 -0
- package/pesquisa/web/joanna-wiebe.md +29 -0
- package/pesquisa/web/marty-neumeier.md +37 -0
- package/pesquisa/web/opensquad.md +23 -0
- package/pesquisa/web/pedro-sobral.md +29 -0
- package/pesquisa/web/pendentes/biblioteca-pendente.md +20 -0
- package/pesquisa/web/russell-brunson.md +30 -0
- package/squads/01-estrategia/agentes/arquiteto-de-diferencial.md +62 -0
- package/squads/01-estrategia/agentes/auditor-de-coerencia-estrategica.md +62 -0
- package/squads/01-estrategia/agentes/especialista-em-posicionamento.md +61 -0
- package/squads/01-estrategia/agentes/estrategista-de-categoria.md +60 -0
- package/squads/01-estrategia/agentes/estrategista-de-mercado.md +61 -0
- package/squads/01-estrategia/agentes/planejador-de-tese.md +60 -0
- package/squads/01-estrategia/agentes/priorizador-estrategico.md +61 -0
- package/squads/01-estrategia/agentes/revisor-estrategico.md +65 -0
- package/squads/01-estrategia/agentes/sintetizador-estrategico.md +62 -0
- package/squads/01-estrategia/agentes/tradutor-estrategico-para-squads.md +62 -0
- package/squads/01-estrategia/squad.md +70 -0
- package/squads/02-pesquisa/agentes/analista-de-concorrencia.md +62 -0
- package/squads/02-pesquisa/agentes/analista-de-tendencias.md +60 -0
- package/squads/02-pesquisa/agentes/auditor-de-suficiencia-de-pesquisa.md +61 -0
- package/squads/02-pesquisa/agentes/bibliotecario-de-evidencias.md +62 -0
- package/squads/02-pesquisa/agentes/curador-de-fontes.md +61 -0
- package/squads/02-pesquisa/agentes/minerador-de-reviews.md +60 -0
- package/squads/02-pesquisa/agentes/organizador-de-insights.md +61 -0
- package/squads/02-pesquisa/agentes/pesquisador-de-mercado.md +61 -0
- package/squads/02-pesquisa/agentes/pesquisador-de-voz-do-cliente.md +63 -0
- package/squads/02-pesquisa/agentes/revisor-de-pesquisa.md +61 -0
- package/squads/02-pesquisa/squad.md +68 -0
- package/squads/03-copy/agentes/copywriter-de-anuncios.md +65 -0
- package/squads/03-copy/agentes/copywriter-de-email.md +65 -0
- package/squads/03-copy/agentes/copywriter-de-landing-page.md +66 -0
- package/squads/03-copy/agentes/critico-de-conversao.md +65 -0
- package/squads/03-copy/agentes/editor-de-copy-de-conversao.md +63 -0
- package/squads/03-copy/agentes/especialista-em-cta.md +65 -0
- package/squads/03-copy/agentes/especialista-em-headlines.md +63 -0
- package/squads/03-copy/agentes/pesquisador-de-mensagem.md +63 -0
- package/squads/03-copy/agentes/revisor-chefe-de-copy.md +65 -0
- package/squads/03-copy/agentes/roteirista-de-funil.md +63 -0
- package/squads/03-copy/skills/estruturar-hook-story-offer.md +61 -0
- package/squads/03-copy/squad.md +73 -0
- package/squads/04-conteudo/agentes/curador-de-temas.md +60 -0
- package/squads/04-conteudo/agentes/especialista-em-reaproveitamento.md +60 -0
- package/squads/04-conteudo/agentes/estrategista-de-conteudo.md +61 -0
- package/squads/04-conteudo/agentes/organizador-de-calendario.md +61 -0
- package/squads/04-conteudo/agentes/otimizador-editorial.md +62 -0
- package/squads/04-conteudo/agentes/planejador-editorial.md +61 -0
- package/squads/04-conteudo/agentes/redator-de-conteudo-seo.md +61 -0
- package/squads/04-conteudo/agentes/redator-social.md +61 -0
- package/squads/04-conteudo/agentes/revisor-de-conteudo.md +63 -0
- package/squads/04-conteudo/agentes/roteirista-de-conteudo.md +61 -0
- package/squads/04-conteudo/squad.md +70 -0
- package/squads/05-design/agentes/auditor-de-coerencia-visual.md +60 -0
- package/squads/05-design/agentes/designer-de-apresentacoes.md +62 -0
- package/squads/05-design/agentes/designer-de-criativos.md +90 -0
- package/squads/05-design/agentes/designer-de-landing-page.md +62 -0
- package/squads/05-design/agentes/designer-visual.md +72 -0
- package/squads/05-design/agentes/diretor-de-arte.md +71 -0
- package/squads/05-design/agentes/especialista-em-sistemas-visuais.md +60 -0
- package/squads/05-design/agentes/revisor-de-design.md +75 -0
- package/squads/05-design/agentes/revisor-de-hierarquia-visual.md +60 -0
- package/squads/05-design/agentes/tradutor-de-marca-para-design.md +62 -0
- package/squads/05-design/squad.md +67 -0
- package/squads/06-branding/agentes/arquiteto-de-diferenciacao.md +62 -0
- package/squads/06-branding/agentes/auditor-de-coerencia-de-marca.md +62 -0
- package/squads/06-branding/agentes/designer-de-narrativa-de-marca.md +62 -0
- package/squads/06-branding/agentes/estrategista-de-marca.md +62 -0
- package/squads/06-branding/agentes/guardiao-de-consistencia.md +62 -0
- package/squads/06-branding/agentes/guardiao-de-tom.md +62 -0
- package/squads/06-branding/agentes/planejador-de-identidade.md +60 -0
- package/squads/06-branding/agentes/revisor-de-distincao.md +60 -0
- package/squads/06-branding/agentes/revisor-de-marca.md +64 -0
- package/squads/06-branding/agentes/unificador-de-linguagem.md +60 -0
- package/squads/06-branding/squad.md +67 -0
- package/squads/07-ux-experiencia/agentes/analista-de-friccao.md +62 -0
- package/squads/07-ux-experiencia/agentes/arquiteto-de-informacao.md +60 -0
- package/squads/07-ux-experiencia/agentes/critico-de-jornada.md +62 -0
- package/squads/07-ux-experiencia/agentes/especialista-em-microcopy.md +63 -0
- package/squads/07-ux-experiencia/agentes/otimizador-de-formularios.md +62 -0
- package/squads/07-ux-experiencia/agentes/planejador-de-onboarding.md +62 -0
- package/squads/07-ux-experiencia/agentes/revisor-de-clareza-de-interface.md +60 -0
- package/squads/07-ux-experiencia/agentes/revisor-de-fluxo.md +60 -0
- package/squads/07-ux-experiencia/agentes/revisor-de-ux.md +62 -0
- package/squads/07-ux-experiencia/agentes/ux-writer.md +62 -0
- package/squads/07-ux-experiencia/squad.md +68 -0
- package/squads/08-growth/agentes/analista-de-alavancas.md +63 -0
- package/squads/08-growth/agentes/analista-de-gargalos.md +63 -0
- package/squads/08-growth/agentes/especialista-em-conversao.md +65 -0
- package/squads/08-growth/agentes/estrategista-de-growth.md +65 -0
- package/squads/08-growth/agentes/integrador-de-funil.md +64 -0
- package/squads/08-growth/agentes/leitor-de-performance-de-crescimento.md +63 -0
- package/squads/08-growth/agentes/planejador-de-testes.md +63 -0
- package/squads/08-growth/agentes/priorizador-de-experimentos.md +63 -0
- package/squads/08-growth/agentes/revisor-de-growth.md +64 -0
- package/squads/08-growth/agentes/revisor-de-hipoteses.md +64 -0
- package/squads/08-growth/squad.md +69 -0
- package/squads/09-seo/agentes/analista-de-intencao-de-busca.md +60 -0
- package/squads/09-seo/agentes/auditor-de-seo.md +60 -0
- package/squads/09-seo/agentes/criador-de-brief-seo.md +61 -0
- package/squads/09-seo/agentes/estrategista-de-seo.md +61 -0
- package/squads/09-seo/agentes/organizador-de-arquitetura-tematica.md +61 -0
- package/squads/09-seo/agentes/pesquisador-de-palavras-chave.md +60 -0
- package/squads/09-seo/agentes/planejador-de-clusters.md +61 -0
- package/squads/09-seo/agentes/planejador-de-links-internos.md +60 -0
- package/squads/09-seo/agentes/revisor-de-seo.md +61 -0
- package/squads/09-seo/agentes/revisor-de-serp-fit.md +60 -0
- package/squads/09-seo/squad.md +67 -0
- package/squads/10-comercial/agentes/arquiteto-de-narrativa-comercial.md +65 -0
- package/squads/10-comercial/agentes/copywriter-de-follow-up-comercial.md +63 -0
- package/squads/10-comercial/agentes/criador-de-soundbites-comerciais.md +62 -0
- package/squads/10-comercial/agentes/estrategista-comercial.md +65 -0
- package/squads/10-comercial/agentes/estruturador-de-proposta.md +63 -0
- package/squads/10-comercial/agentes/organizador-de-materiais-comerciais.md +62 -0
- package/squads/10-comercial/agentes/redator-de-pitch.md +63 -0
- package/squads/10-comercial/agentes/revisor-comercial.md +65 -0
- package/squads/10-comercial/agentes/revisor-de-objecoes.md +63 -0
- package/squads/10-comercial/agentes/tradutor-de-posicionamento-para-vendas.md +62 -0
- package/squads/10-comercial/squad.md +70 -0
- package/squads/11-oferta-monetizacao/agentes/analista-de-logica-de-monetizacao.md +60 -0
- package/squads/11-oferta-monetizacao/agentes/arquiteto-de-oferta.md +62 -0
- package/squads/11-oferta-monetizacao/agentes/designer-de-bonus.md +60 -0
- package/squads/11-oferta-monetizacao/agentes/especialista-em-garantia.md +61 -0
- package/squads/11-oferta-monetizacao/agentes/estrategista-de-monetizacao.md +62 -0
- package/squads/11-oferta-monetizacao/agentes/estruturador-de-pacotes.md +61 -0
- package/squads/11-oferta-monetizacao/agentes/planejador-de-escada-de-valor.md +61 -0
- package/squads/11-oferta-monetizacao/agentes/planejador-de-progressao.md +60 -0
- package/squads/11-oferta-monetizacao/agentes/revisor-de-oferta.md +61 -0
- package/squads/11-oferta-monetizacao/agentes/revisor-de-stack-de-valor.md +60 -0
- package/squads/11-oferta-monetizacao/squad.md +69 -0
- package/squads/12-operacoes/agentes/arquiteto-de-operacoes.md +61 -0
- package/squads/12-operacoes/agentes/auditor-de-processo.md +60 -0
- package/squads/12-operacoes/agentes/criador-de-checklist.md +60 -0
- package/squads/12-operacoes/agentes/documentador-operacional.md +62 -0
- package/squads/12-operacoes/agentes/estruturador-de-passagens-entre-squads.md +61 -0
- package/squads/12-operacoes/agentes/mapeador-de-fluxos.md +60 -0
- package/squads/12-operacoes/agentes/organizador-de-playbooks.md +60 -0
- package/squads/12-operacoes/agentes/redator-de-sop.md +61 -0
- package/squads/12-operacoes/agentes/revisor-operacional.md +64 -0
- package/squads/12-operacoes/agentes/verificador-de-governanca.md +61 -0
- package/squads/12-operacoes/squad.md +64 -0
- package/squads/13-qualidade/agentes/auditor-de-consistencia.md +62 -0
- package/squads/13-qualidade/agentes/auditor-de-friccao.md +62 -0
- package/squads/13-qualidade/agentes/critico-de-conversao.md +65 -0
- package/squads/13-qualidade/agentes/guardiao-de-aprovacao-final.md +62 -0
- package/squads/13-qualidade/agentes/identificador-de-risco-de-entrega.md +62 -0
- package/squads/13-qualidade/agentes/lider-de-qualidade.md +62 -0
- package/squads/13-qualidade/agentes/revisor-de-clareza.md +61 -0
- package/squads/13-qualidade/agentes/revisor-de-coerencia-de-mensagem.md +62 -0
- package/squads/13-qualidade/agentes/revisor-de-prontidao.md +62 -0
- package/squads/13-qualidade/agentes/verificador-de-logica.md +60 -0
- package/squads/13-qualidade/squad.md +63 -0
- package/squads/14-automacao-sistemas/agentes/arquiteto-de-sistema.md +61 -0
- package/squads/14-automacao-sistemas/agentes/auditor-de-coerencia-sistemica.md +62 -0
- package/squads/14-automacao-sistemas/agentes/curador-de-matrizes.md +60 -0
- package/squads/14-automacao-sistemas/agentes/designer-de-orquestracao.md +61 -0
- package/squads/14-automacao-sistemas/agentes/estruturador-de-skills.md +61 -0
- package/squads/14-automacao-sistemas/agentes/gestor-de-estado.md +61 -0
- package/squads/14-automacao-sistemas/agentes/guardiao-da-estrutura-do-framework.md +62 -0
- package/squads/14-automacao-sistemas/agentes/integrador-de-fluxos.md +61 -0
- package/squads/14-automacao-sistemas/agentes/planejador-de-automacao.md +61 -0
- package/squads/14-automacao-sistemas/agentes/revisor-de-sistema.md +62 -0
- package/squads/14-automacao-sistemas/squad.md +66 -0
- package/squads/15-executivo-pmo/agentes/coordenador-de-pmo.md +61 -0
- package/squads/15-executivo-pmo/agentes/definidor-de-escopo.md +62 -0
- package/squads/15-executivo-pmo/agentes/estrategista-executivo.md +62 -0
- package/squads/15-executivo-pmo/agentes/guardiao-de-escopo.md +61 -0
- package/squads/15-executivo-pmo/agentes/organizador-de-ativacao-de-squads.md +61 -0
- package/squads/15-executivo-pmo/agentes/planejador-de-prioridades.md +61 -0
- package/squads/15-executivo-pmo/agentes/registrador-de-decisoes.md +61 -0
- package/squads/15-executivo-pmo/agentes/revisor-de-dependencias.md +60 -0
- package/squads/15-executivo-pmo/agentes/revisor-executivo.md +63 -0
- package/squads/15-executivo-pmo/agentes/sequenciador-de-execucao.md +60 -0
- package/squads/15-executivo-pmo/squad.md +66 -0
- package/squads/16-trafego-pago/agentes/analista-de-metricas-e-otimizacao.md +62 -0
- package/squads/16-trafego-pago/agentes/analista-de-publicos.md +61 -0
- package/squads/16-trafego-pago/agentes/auditor-de-contas-e-campanhas.md +60 -0
- package/squads/16-trafego-pago/agentes/escalador-de-campanhas.md +61 -0
- package/squads/16-trafego-pago/agentes/especialista-em-criativos-de-performance.md +61 -0
- package/squads/16-trafego-pago/agentes/especialista-em-estrutura-de-funil-pago.md +63 -0
- package/squads/16-trafego-pago/agentes/estrategista-de-trafego-pago.md +67 -0
- package/squads/16-trafego-pago/agentes/gestor-de-remarketing.md +63 -0
- package/squads/16-trafego-pago/agentes/planejador-de-campanhas.md +61 -0
- package/squads/16-trafego-pago/agentes/revisor-de-performance.md +62 -0
- package/squads/16-trafego-pago/skills/calcular-faixa-de-investimento.md +61 -0
- package/squads/16-trafego-pago/squad.md +73 -0
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Fase 5/6 do /edusquads investigar.
|
|
4
|
+
|
|
5
|
+
Conclui uma investigação a partir do run_id:
|
|
6
|
+
- preenche síntese interpretativa com filtros por plataforma
|
|
7
|
+
- preenche extrações úteis por squad
|
|
8
|
+
- calcula score de confiança dos insights
|
|
9
|
+
- aproveita coleta estruturada (quando disponível)
|
|
10
|
+
- registra limites da evidência
|
|
11
|
+
- atualiza status do run para concluído
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import collections
|
|
18
|
+
import json
|
|
19
|
+
import re
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
# Paths are resolved relative to the package root (two levels above this script).
ROOT = Path(__file__).resolve().parents[2]
EVID_DIR = ROOT / "_edusquads" / "evidencias"
RUNS_DIR = ROOT / "_edusquads" / "runs"
RUNS_INDEX = RUNS_DIR / "RUNS-INDEX.md"

# Generic noise words (Portuguese + English platform/UI chrome) excluded from
# term mining. NOTE: the original literal listed "from" twice; the duplicate
# was dropped (a set holds it once either way).
BASE_STOPWORDS = {
    "para", "com", "uma", "como", "mais", "sobre", "esta", "esse", "isso", "pela", "pelo",
    "from", "meta", "instagram", "https", "www", "login", "account", "your", "you", "and", "the",
    "that", "this", "have", "will", "where", "which", "quando", "depois", "entre", "tambem",
    "password", "email", "mobile", "create", "forgot", "terms", "privacy", "jobs", "help", "blog",
}

# Per-platform UI words that carry no analytical signal for that platform.
PLATFORM_NOISE = {
    "instagram": {"threads", "verified", "accounts", "facebook", "meta", "moments"},
    "youtube": {"shorts", "subscribe", "watch", "playlist", "channel"},
    "linkedin": {"connect", "message", "premium", "hiring", "follow"},
    "x-twitter": {"retweet", "reply", "quote", "tweet", "tweets"},
}

# Case-insensitive detectors for call-to-action and content-format keywords.
CTA_RE = re.compile(r"\b(comente|clique|saiba|inscreva|assine|dm|direct|link|cadastre|baixe|seguir|salvar|compartilhar|agende|mensagem)\b", re.I)
FORMAT_RE = re.compile(r"\b(carrossel|reel|reels|video|vídeo|thread|post|posts|live|stories|newsletter|artigo|webinar|aula|corte)\b", re.I)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def clamp(v: int, lo: int = 0, hi: int = 100) -> int:
    """Clamp *v* into the inclusive range [lo, hi]."""
    if v < lo:
        return lo
    if v > hi:
        return hi
    return v
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def find_evidence_file(run_id: str) -> Path:
    """Return the first (lexicographically) evidence file for *run_id*.

    Exits with an error message when no matching evidence file exists.
    """
    candidates = sorted(EVID_DIR.glob(f"EVIDENCIA-{run_id}-*.md"))
    if candidates:
        return candidates[0]
    raise SystemExit(f"Nenhuma evidência encontrada para {run_id}")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def find_run_file(run_id: str) -> Path:
    """Return the run markdown file for *run_id*, exiting if it is missing."""
    run_path = RUNS_DIR / f"{run_id}.md"
    if run_path.exists():
        return run_path
    raise SystemExit(f"Run não encontrado: {run_path}")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def extract_field(pattern: str, text: str, fallback: str = "") -> str:
    """Return group 1 of the first *pattern* match in *text*, stripped.

    Falls back to *fallback* when the pattern does not match.
    """
    match = re.search(pattern, text)
    if match is None:
        return fallback
    return match.group(1).strip()
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def extract_top_terms(text: str, platform: str, n: int = 8) -> list[str]:
    """Return the *n* most frequent meaningful words (4+ letters) in *text*.

    Tokens found in BASE_STOPWORDS, or in the noise set registered for
    *platform* (unknown platforms get an empty noise set), are ignored.
    """
    platform_noise = PLATFORM_NOISE.get(platform, set())
    tokens = re.findall(r"[A-Za-zÀ-ÿ]{4,}", text.lower())
    meaningful = (t for t in tokens if t not in BASE_STOPWORDS and t not in platform_noise)
    return [word for word, _ in collections.Counter(meaningful).most_common(n)]
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def extract_matches(pattern: re.Pattern[str], text: str, limit: int = 8) -> list[str]:
    """Return up to *limit* distinct group-1 matches of *pattern* in *text*.

    Matches are lowercased; order of first appearance is preserved.
    """
    hits = (m.group(1).lower() for m in pattern.finditer(text))
    # dict.fromkeys deduplicates while keeping insertion order.
    return list(dict.fromkeys(hits))[:limit]
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def load_structured_if_exists(run_id: str) -> dict | None:
    """Load the structured-collection JSON artifact for *run_id*, if any.

    Returns None when the file is missing, unreadable, not valid JSON, or its
    top-level value is not an object — callers access the result with `.get`,
    so a stray top-level list/string must not leak through.
    """
    p = EVID_DIR / f"ESTRUTURADA-{run_id}.json"
    if not p.exists():
        return None
    try:
        data = json.loads(p.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # Best-effort: a corrupt/unreadable artifact degrades gracefully.
        # (json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.)
        return None
    return data if isinstance(data, dict) else None
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def compute_confidence(
    source_text: str,
    terms: list[str],
    formats: list[str],
    ctas: list[str],
    limitations: str,
    structured: dict | None,
) -> dict[str, int]:
    """Estimate 0-100 confidence scores for the extracted insights.

    The base score grows with raw-text length and with how many terms,
    formats and CTAs were detected; it drops when the stated limitations
    suggest restricted access, and gains a small bonus from the structured
    collection's completeness score.
    """
    base = 20

    # Longer raw collections tend to yield more reliable signals.
    length = len(source_text)
    if length >= 3000:
        base += 30
    elif length >= 1500:
        base += 20
    elif length >= 800:
        base += 10
    else:
        base += 2

    base += min(len(terms) * 4, 20)
    base += min(len(formats) * 6, 18)
    base += min(len(ctas) * 6, 18)

    # Penalize collections that were visibly gated by logins / partial access.
    low_access_markers = ["login", "sem autenticação", "acesso parcial", "bloqueado", "exigiu", "http 999", "999"]
    limitations_lc = limitations.lower()
    if any(marker in limitations_lc for marker in low_access_markers):
        base -= 20

    # Bonus from the structured collection artifact (Fase 6), when present.
    if structured:
        completeness = int(structured.get("quality", {}).get("completeness_score", 0))
        base += min(completeness // 8, 10)

    narrativa = clamp(base)
    formato = clamp(base - (0 if formats else 10))
    cta = clamp(base - (0 if ctas else 12))
    geral = clamp(round((narrativa + formato + cta) / 3))

    return {
        "geral": geral,
        "narrativa": narrativa,
        "formato": formato,
        "cta": cta,
    }
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def merge_with_structured(
|
|
142
|
+
terms: list[str],
|
|
143
|
+
formats: list[str],
|
|
144
|
+
ctas: list[str],
|
|
145
|
+
structured: dict | None,
|
|
146
|
+
) -> tuple[list[str], list[str], list[str], int]:
|
|
147
|
+
completeness = 0
|
|
148
|
+
if not structured:
|
|
149
|
+
return terms, formats, ctas, completeness
|
|
150
|
+
|
|
151
|
+
s = structured.get("signals", {})
|
|
152
|
+
s_terms = [str(x).lower() for x in s.get("topics", [])]
|
|
153
|
+
s_formats = [str(x).lower() for x in s.get("formats", [])]
|
|
154
|
+
s_ctas = [str(x).lower() for x in s.get("ctas", [])]
|
|
155
|
+
completeness = int(structured.get("quality", {}).get("completeness_score", 0))
|
|
156
|
+
|
|
157
|
+
def merge(a: list[str], b: list[str], limit: int = 10) -> list[str]:
|
|
158
|
+
out = list(a)
|
|
159
|
+
for item in b:
|
|
160
|
+
if item not in out:
|
|
161
|
+
out.append(item)
|
|
162
|
+
if len(out) >= limit:
|
|
163
|
+
break
|
|
164
|
+
return out
|
|
165
|
+
|
|
166
|
+
return merge(terms, s_terms), merge(formats, s_formats), merge(ctas, s_ctas), completeness
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def build_synthesis(
    run_id: str,
    platform: str,
    source_text: str,
    target: str,
    limitations: str,
) -> tuple[list[str], dict[str, list[str]], dict[str, int], int]:
    """Build the interpretive synthesis for one investigation run.

    Returns a 4-tuple:
    - synthesis: markdown bullet lines summarizing the collection,
    - squads: actionable bullets keyed by squad ("estrategia", "conteudo",
      "copy", "design"),
    - conf: confidence scores from compute_confidence,
    - completeness: structured-collection completeness (0 when absent).
    """
    # Heuristic signal extraction from the raw collected text.
    terms = extract_top_terms(source_text, platform=platform)
    ctas = extract_matches(CTA_RE, source_text)
    formats = extract_matches(FORMAT_RE, source_text)

    # Blend in the Fase 6 structured artifact when one was produced for this run.
    structured = load_structured_if_exists(run_id)
    terms, formats, ctas, completeness = merge_with_structured(terms, formats, ctas, structured)

    conf = compute_confidence(source_text, terms, formats, ctas, limitations, structured)

    # Human-readable summary bullets (Portuguese, consumed by the evidence file).
    synthesis = [
        f"Coleta processada para `{target}` com filtro de ruído para {platform}.",
        f"Termos recorrentes úteis: {', '.join(terms) if terms else 'não identificado (fonte limitada)'}.",
        f"Formatos detectados: {', '.join(formats) if formats else 'não detectado com confiabilidade'}.",
        f"Sinais de CTA detectados: {', '.join(ctas) if ctas else 'não detectado com confiabilidade'}.",
        f"Confiabilidade geral estimada: {conf['geral']}/100.",
    ]
    if structured:
        synthesis.append(f"Coleta estruturada aplicada com completude {completeness}/100.")

    # Actionable bullets routed to the four downstream squads; each squad sees
    # at most the top 4 of its relevant signal, with a fallback suggestion.
    squads = {
        "estrategia": [
            f"Mapear narrativa central usando termos: {', '.join(terms[:4]) if terms else 'dados insuficientes'}.",
            "Confrontar sinais observados com a tese/posicionamento atual da marca.",
        ],
        "conteudo": [
            f"Priorizar formatos observados: {', '.join(formats[:4]) if formats else 'testar carrossel/reel/post por hipótese'}.",
            "Montar backlog de 3 temas derivados dos termos recorrentes.",
        ],
        "copy": [
            f"Testar variações de CTA com base em: {', '.join(ctas[:4]) if ctas else 'comente/salve/compartilhe'}.",
            "Refinar hooks para promessa específica + ganho imediato.",
        ],
        "design": [
            "Traduzir formato dominante em padrão visual consistente por canal.",
            "Aplicar contraste/hierarquia para leitura rápida no feed.",
        ],
    }
    return synthesis, squads, conf, completeness
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def update_evidence(
    evidence_path: Path,
    synthesis: list[str],
    squads: dict[str, list[str]],
    source_ref: str,
    limitations: str,
    confidence: dict[str, int],
    completeness: int,
) -> None:
    """Fill the evidence markdown file in place with synthesis results.

    Replaces the placeholder synthesis bullet and rewrites the section span
    from "## Extrações úteis para squads" up to "## Segurança" with the squad
    extractions, confidence scores and evidence limits. Relies on those exact
    literal markers being present in the template.
    """
    txt = evidence_path.read_text(encoding="utf-8")

    # Swap the template placeholder for the generated synthesis bullets.
    txt = txt.replace(
        "- (preencher após investigação de navegação)",
        "\n".join(f"- {line}" for line in synthesis),
    )

    extractions_block = (
        "## Extrações úteis para squads\n"
        "- Squad de Estratégia:\n"
        + "\n".join(f" - {b}" for b in squads["estrategia"]) + "\n"
        + "- Squad de Conteúdo:\n"
        + "\n".join(f" - {b}" for b in squads["conteudo"]) + "\n"
        + "- Squad de Copy:\n"
        + "\n".join(f" - {b}" for b in squads["copy"]) + "\n"
        + "- Squad de Design:\n"
        + "\n".join(f" - {b}" for b in squads["design"]) + "\n\n"
        + "## Confiabilidade dos insights\n"
        + f"- score_geral: {confidence['geral']}/100\n"
        + f"- score_narrativa: {confidence['narrativa']}/100\n"
        + f"- score_formato: {confidence['formato']}/100\n"
        + f"- score_cta: {confidence['cta']}/100\n"
        + f"- score_completude_coleta_estruturada: {completeness}/100\n\n"
        + "## Limites da evidência\n"
        + f"- {limitations}\n"
        + f"- Fonte usada na síntese: {source_ref}\n"
    )

    # Non-greedy span replace between the two section headers.
    # NOTE(review): the replacement is passed through re.sub, so backslash
    # escapes inside the generated content would be interpreted — confirm
    # inputs can never contain backslashes.
    txt = re.sub(
        r"## Extrações úteis para squads\n(?:.|\n)*?## Segurança\n",
        extractions_block + "\n## Segurança\n",
        txt,
        flags=re.M,
    )

    evidence_path.write_text(txt, encoding="utf-8")
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def update_run(run_path: Path, source_ref: str, confidence: dict[str, int], completeness: int) -> None:
    """Mark the run file as concluded and record the synthesis outcome.

    All edits are literal-string or regex substitutions against the run
    template: status flip, both checkpoint sections, pending-actions cleanup,
    a validation entry referencing *source_ref*, and stripping of absolute
    paths so the file stays portable.
    """
    txt = run_path.read_text(encoding="utf-8")
    txt = txt.replace("- status: em-andamento", "- status: concluído")

    # Production checkpoint: exact template text → approved.
    txt = txt.replace(
        "### Checkpoint de produção\n- resultado: em-andamento\n- observações: investigação web pendente de execução completa.",
        "### Checkpoint de produção\n- resultado: aprovado\n- observações: investigação executada e evidência preenchida automaticamente.",
    )

    # Quality checkpoint: any previous result/observations are overwritten.
    txt = re.sub(
        r"### Checkpoint de qualidade\n- resultado: .*\n- observações: .*",
        "### Checkpoint de qualidade\n- resultado: aprovado\n"
        f"- observações: síntese com score de confiança {confidence['geral']}/100 e completude estruturada {completeness}/100.",
        txt,
    )

    # Clear the two pending next-step bullets left by the planning phase.
    txt = txt.replace(
        "- executar navegação e preencher síntese interpretativa da evidência.\n- atualizar status para concluído ao finalizar a investigação.",
        "- none.",
    )

    # Append a validation entry once (skipped if source_ref already recorded).
    marker = "- validações realizadas:\n"
    if marker in txt and source_ref not in txt:
        txt = txt.replace(marker, marker + f" - síntese automática concluída com base em `{source_ref}`\n")

    # Strip absolute root paths (both separator styles) for portability.
    txt = txt.replace(str(ROOT).replace('\\', '/').rstrip('/'), "")
    txt = txt.replace(str(ROOT).replace('/', '\\').rstrip('\\'), "")
    # After stripping, root-relative backticked paths lose their leading slash.
    txt = txt.replace("`/_edusquads/", "`_edusquads/")

    run_path.write_text(txt, encoding="utf-8")
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def update_runs_index_status(run_id: str, new_status: str = "concluído") -> None:
    """Rewrite the status cell of *run_id*'s row in the runs index table.

    A no-op when the index file does not exist or the row/status pattern is
    not found.
    """
    if not RUNS_INDEX.exists():
        return
    content = RUNS_INDEX.read_text(encoding="utf-8")
    row_pattern = rf"(\|\s*{re.escape(run_id)}\s*\|[^\n]*\|\s*)(planejado|em-andamento|bloqueado|concluído)(\s*\|[^\n]*\n)"
    content = re.sub(row_pattern, rf"\1{new_status}\3", content)
    RUNS_INDEX.write_text(content, encoding="utf-8")
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def main() -> int:
    """CLI entry point: conclude an investigation run (Fase 5/6).

    Reads the run and evidence files for the given run_id, synthesizes the
    collected source, writes the results back into both artifacts and the
    runs index, then prints a machine-readable summary. Returns 0 on success;
    exits via SystemExit on missing files.
    """
    parser = argparse.ArgumentParser(description="Concluir investigação do /edusquads")
    parser.add_argument("run_id", help="ID do run, ex: RUN-2026-04-11-005")
    parser.add_argument("--fonte", required=True, help="Arquivo markdown fonte da coleta")
    parser.add_argument("--limites", default="Coleta parcial ou condicionada por acesso/login da plataforma.")
    args = parser.parse_args()

    run_id = args.run_id.strip()
    run_path = find_run_file(run_id)
    evidence_path = find_evidence_file(run_id)
    # Relative --fonte paths are resolved against the package root.
    source_path = Path(args.fonte)
    if not source_path.is_absolute():
        source_path = (ROOT / source_path).resolve()
    if not source_path.exists():
        raise SystemExit(f"Fonte não encontrada: {source_path}")

    # errors="ignore": the scraped source may contain undecodable bytes.
    source_text = source_path.read_text(encoding="utf-8", errors="ignore")
    ev_text = evidence_path.read_text(encoding="utf-8")
    target = extract_field(r"- url/perfil/query: (.+)", ev_text, "alvo não identificado")
    platform = extract_field(r"- plataforma detectada: (.+)", ev_text, "instagram").lower()

    synthesis, squads, confidence, completeness = build_synthesis(
        run_id=run_id,
        platform=platform,
        source_text=source_text,
        target=target,
        limitations=args.limites,
    )
    # NOTE(review): relative_to raises ValueError if --fonte points outside
    # ROOT — confirm callers always pass in-tree sources.
    source_ref = source_path.relative_to(ROOT).as_posix()

    # Persist results: evidence file, run file, then the global index.
    update_evidence(
        evidence_path=evidence_path,
        synthesis=synthesis,
        squads=squads,
        source_ref=source_ref,
        limitations=args.limites,
        confidence=confidence,
        completeness=completeness,
    )
    update_run(run_path=run_path, source_ref=source_ref, confidence=confidence, completeness=completeness)
    update_runs_index_status(run_id=run_id, new_status="concluído")

    # key=value summary consumed by the orchestrating command.
    print(f"run_id={run_id}")
    print(f"run_file={run_path.relative_to(ROOT).as_posix()}")
    print(f"evidence_file={evidence_path.relative_to(ROOT).as_posix()}")
    print(f"source_file={source_ref}")
    print(f"confidence={confidence['geral']}/100")
    print(f"structured_completeness={completeness}/100")
    return 0
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
if __name__ == "__main__":
|
|
356
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Fase 6 do /edusquads investigar.
|
|
4
|
+
|
|
5
|
+
Estrutura a coleta bruta em um artefato padronizado por plataforma:
|
|
6
|
+
- gera JSON estruturado por run
|
|
7
|
+
- gera resumo markdown para consumo humano
|
|
8
|
+
- calcula score de completude da coleta
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import collections
|
|
15
|
+
import json
|
|
16
|
+
import re
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
ROOT = Path(__file__).resolve().parents[2]
|
|
20
|
+
EVID_DIR = ROOT / "_edusquads" / "evidencias"
|
|
21
|
+
RUNS_DIR = ROOT / "_edusquads" / "runs"
|
|
22
|
+
|
|
23
|
+
# Common Portuguese/English filler words plus social-platform UI boilerplate,
# ignored when extracting topic keywords from the raw collection text.
# Fix: the original literal listed "from" twice (redundant in a set).
STOPWORDS = {
    "para", "com", "uma", "como", "mais", "sobre", "esta", "esse", "isso", "pela", "pelo",
    "from", "meta", "instagram", "https", "www", "login", "account", "your", "you", "and", "the",
    "that", "this", "have", "will", "where", "which", "quando", "depois", "entre", "tambem",
    "password", "email", "mobile", "create", "forgot", "terms", "privacy", "jobs", "help", "blog",
}
|
|
29
|
+
|
|
30
|
+
FORMAT_RE = re.compile(r"\b(carrossel|reel|reels|video|vídeo|thread|post|posts|live|stories|newsletter|artigo|webinar|aula|corte)\b", re.I)
|
|
31
|
+
CTA_RE = re.compile(r"\b(comente|clique|saiba|inscreva|assine|dm|direct|link|cadastre|baixe|seguir|salvar|compartilhar|agende|mensagem|comprar|fale)\b", re.I)
|
|
32
|
+
HASHTAG_RE = re.compile(r"#([A-Za-z0-9_]{2,})")
|
|
33
|
+
URL_RE = re.compile(r"https?://[^\s)]+")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def clamp(v: int, lo: int = 0, hi: int = 100) -> int:
    """Limit *v* to the inclusive range [lo, hi]."""
    if v < lo:
        return lo
    if v > hi:
        return hi
    return v
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def find_run(run_id: str) -> Path:
    """Return the path of the run's markdown file, aborting the CLI if it is missing."""
    run_path = RUNS_DIR / f"{run_id}.md"
    if run_path.exists():
        return run_path
    raise SystemExit(f"Run não encontrado: {run_path}")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def find_evidence(run_id: str) -> Path:
    """Return the first (alphabetically sorted) evidence file for *run_id*, or abort."""
    candidates = sorted(EVID_DIR.glob(f"EVIDENCIA-{run_id}-*.md"))
    if not candidates:
        raise SystemExit(f"Evidência não encontrada para {run_id}")
    return candidates[0]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def detect_platform_from_evidence(evidence_text: str) -> str:
    """Read the detected platform from the evidence markdown; default is "instagram"."""
    match = re.search(r"- plataforma detectada: (.+)", evidence_text)
    if match is None:
        return "instagram"
    return match.group(1).strip().lower()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def extract_target(evidence_text: str) -> str:
    """Read the investigation target (url/profile/query) from the evidence markdown."""
    match = re.search(r"- url/perfil/query: (.+)", evidence_text)
    if match is None:
        return "não identificado"
    return match.group(1).strip()
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def uniq(items: list[str], limit: int = 20) -> list[str]:
    """Return up to *limit* stripped, non-empty items, deduplicated in first-seen order.

    Fix: membership was tested against the growing output list (O(n) per item,
    O(n^2) overall); a companion set gives O(1) lookups with identical results.
    """
    out: list[str] = []
    seen: set[str] = set()
    for item in items:
        stripped = item.strip()
        if stripped and stripped not in seen:
            seen.add(stripped)
            out.append(stripped)
            if len(out) >= limit:
                break
    return out
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def extract_top_terms(text: str, limit: int = 12) -> list[str]:
    """Return the *limit* most frequent content words (4+ letters, stopwords removed)."""
    tokens = (w for w in re.findall(r"[A-Za-zÀ-ÿ]{4,}", text.lower()) if w not in STOPWORDS)
    ranked = collections.Counter(tokens).most_common(limit)
    return [word for word, _count in ranked]
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def platform_fields(platform: str, text: str) -> dict:
    r"""Extract boolean platform-specific heuristic signals from *text*.

    Returns a dict of flags keyed per platform; unknown platforms get
    {"sinal_generico": True}.

    Fix: alternations such as r"\bthumb|thumbnail\b" bound the word boundary
    only to the outermost alternative, so substrings inside longer words could
    match (e.g. "landslides" triggered the carousel signal). Every alternation
    is now grouped with (?:...) so \b applies to each alternative; "posts?"
    keeps the plural the old prefix match accepted.
    """
    t = text.lower()

    if platform == "instagram":
        return {
            "bio_detectada": bool(re.search(r"\bbio\b|perfil|creator|digital|marketing", t)),
            "destaques_detectados": bool(re.search(r"destaques|highlights", t)),
            "sinal_reels": bool(re.search(r"\breels?\b", t)),
            "sinal_carrossel": bool(re.search(r"\b(?:carrossel|slides?)\b", t)),
        }

    if platform == "youtube":
        return {
            "sinal_titulo_video": bool(re.search(r"\bwatch\b|\bvídeo\b|\bvideo\b", t)),
            "sinal_thumb": bool(re.search(r"\b(?:thumb|thumbnail)\b", t)),
            "sinal_playlist": bool(re.search(r"\bplaylist\b", t)),
            "sinal_descricao": bool(re.search(r"\b(?:descrição|description)\b", t)),
        }

    if platform == "linkedin":
        return {
            "sinal_headline": bool(re.search(r"\bheadline\b|\btitle\b|\bcargo\b", t)),
            "sinal_about": bool(re.search(r"\babout\b|\bsobre\b", t)),
            "sinal_post": bool(re.search(r"\b(?:posts?|publicação|publicacao)\b", t)),
            "sinal_prova_social": bool(re.search(r"\b(?:comentários|comentarios|recomendações|recomendacoes)\b", t)),
        }

    if platform == "x-twitter":
        return {
            "sinal_thread": bool(re.search(r"\bthread\b", t)),
            "sinal_reply": bool(re.search(r"\b(?:reply|resposta)\b", t)),
            "sinal_retweet": bool(re.search(r"\bretweet\b", t)),
            "sinal_link_externo": bool(re.search(r"http", t)),
        }

    return {"sinal_generico": True}
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def completeness_score(chars: int, formats: list[str], ctas: list[str], links: list[str], pf: dict) -> int:
    """Score (0-100) how complete the raw collection looks.

    Base of 20, a tiered bonus for text volume, capped bonuses for detected
    formats/CTAs/links, and 3 points per truthy platform signal.
    """
    if chars > 2500:
        volume_bonus = 30
    elif chars > 1200:
        volume_bonus = 20
    elif chars > 600:
        volume_bonus = 10
    else:
        volume_bonus = 0

    total = 20 + volume_bonus
    total += min(8 * len(formats), 20)
    total += min(8 * len(ctas), 20)
    total += min(3 * len(links), 10)
    total += 3 * sum(1 for flag in pf.values() if flag)
    return clamp(total)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def write_outputs(run_id: str, data: dict) -> tuple[Path, Path]:
    """Persist the structured collection as JSON plus a human-readable markdown summary."""
    json_path = EVID_DIR / f"ESTRUTURADA-{run_id}.json"
    md_path = EVID_DIR / f"ESTRUTURADA-{run_id}.md"

    json_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")

    signals = data["signals"]
    lines = [
        f"# Coleta Estruturada — {run_id}",
        "",
        "## Metadados",
        f"- plataforma: {data['platform']}",
        f"- alvo: {data['target']}",
        f"- caracteres_fonte: {data['source']['chars']}",
        f"- linhas_fonte: {data['source']['lines']}",
        "",
        "## Sinais extraídos",
        f"- formatos: {', '.join(signals['formats']) if signals['formats'] else 'nenhum'}",
        f"- ctas: {', '.join(signals['ctas']) if signals['ctas'] else 'nenhum'}",
        f"- hashtags: {', '.join(signals['hashtags']) if signals['hashtags'] else 'nenhuma'}",
        f"- links_detectados: {len(signals['links'])}",
        f"- topicos: {', '.join(signals['topics']) if signals['topics'] else 'nenhum'}",
        "",
        "## Campos por plataforma",
    ]
    lines.extend(f"- {key}: {'sim' if value else 'não'}" for key, value in data["platform_fields"].items())
    lines.extend([
        "",
        "## Qualidade da coleta",
        f"- score_completude: {data['quality']['completeness_score']}/100",
    ])

    md_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    return json_path, md_path
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def main() -> int:
    """CLI entry point: structure a raw collection file for an existing run."""
    parser = argparse.ArgumentParser(description="Estruturar coleta por plataforma")
    parser.add_argument("run_id", help="ID do run (ex.: RUN-2026-04-11-005)")
    parser.add_argument("--fonte", required=True, help="Arquivo markdown da coleta bruta")
    args = parser.parse_args()

    run_id = args.run_id.strip()
    find_run(run_id)  # aborts via SystemExit when the run does not exist

    evidence_text = find_evidence(run_id).read_text(encoding="utf-8")

    # Resolve the raw-collection file against the repo root when given as relative.
    source_path = Path(args.fonte)
    if not source_path.is_absolute():
        source_path = (ROOT / source_path).resolve()
    if not source_path.exists():
        raise SystemExit(f"Fonte não encontrada: {source_path}")
    source_text = source_path.read_text(encoding="utf-8", errors="ignore")

    platform = detect_platform_from_evidence(evidence_text)
    target = extract_target(evidence_text)

    # Extract all textual signals up front, then score them.
    signals = {
        "formats": uniq([hit.group(1).lower() for hit in FORMAT_RE.finditer(source_text)], limit=12),
        "ctas": uniq([hit.group(1).lower() for hit in CTA_RE.finditer(source_text)], limit=12),
        "hashtags": uniq([hit.group(1).lower() for hit in HASHTAG_RE.finditer(source_text)], limit=30),
        "links": uniq(URL_RE.findall(source_text), limit=100),
        "topics": extract_top_terms(source_text, limit=12),
    }
    pf = platform_fields(platform, source_text)
    score = completeness_score(len(source_text), signals["formats"], signals["ctas"], signals["links"], pf)

    data = {
        "run_id": run_id,
        "platform": platform,
        "target": target,
        "source": {
            "path": source_path.relative_to(ROOT).as_posix(),
            "chars": len(source_text),
            "lines": len(source_text.splitlines()),
        },
        "signals": signals,
        "platform_fields": pf,
        "quality": {"completeness_score": score},
    }

    json_path, md_path = write_outputs(run_id, data)

    print(f"run_id={run_id}")
    print(f"platform={platform}")
    print(f"structured_json={json_path.relative_to(ROOT).as_posix()}")
    print(f"structured_md={md_path.relative_to(ROOT).as_posix()}")
    print(f"completeness={data['quality']['completeness_score']}/100")
    return 0
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
if __name__ == "__main__":
|
|
237
|
+
raise SystemExit(main())
|