monoco-toolkit 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. monoco/__main__.py +8 -0
  2. monoco/core/artifacts/__init__.py +16 -0
  3. monoco/core/artifacts/manager.py +575 -0
  4. monoco/core/artifacts/models.py +161 -0
  5. monoco/core/config.py +38 -4
  6. monoco/core/git.py +23 -0
  7. monoco/core/hooks/builtin/git_cleanup.py +1 -1
  8. monoco/core/ingestion/__init__.py +20 -0
  9. monoco/core/ingestion/discovery.py +248 -0
  10. monoco/core/ingestion/watcher.py +343 -0
  11. monoco/core/ingestion/worker.py +436 -0
  12. monoco/core/injection.py +63 -29
  13. monoco/core/integrations.py +2 -2
  14. monoco/core/loader.py +633 -0
  15. monoco/core/output.py +5 -5
  16. monoco/core/registry.py +34 -19
  17. monoco/core/resource/__init__.py +5 -0
  18. monoco/core/resource/finder.py +98 -0
  19. monoco/core/resource/manager.py +91 -0
  20. monoco/core/resource/models.py +35 -0
  21. monoco/core/skill_framework.py +292 -0
  22. monoco/core/skills.py +524 -385
  23. monoco/core/sync.py +73 -1
  24. monoco/core/workflow_converter.py +420 -0
  25. monoco/daemon/app.py +77 -1
  26. monoco/daemon/commands.py +10 -0
  27. monoco/daemon/mailroom_service.py +196 -0
  28. monoco/daemon/models.py +1 -0
  29. monoco/daemon/scheduler.py +236 -0
  30. monoco/daemon/services.py +185 -0
  31. monoco/daemon/triggers.py +55 -0
  32. monoco/features/agent/__init__.py +2 -2
  33. monoco/features/agent/adapter.py +41 -0
  34. monoco/features/agent/apoptosis.py +44 -0
  35. monoco/features/agent/cli.py +101 -144
  36. monoco/features/agent/config.py +35 -21
  37. monoco/features/agent/defaults.py +6 -49
  38. monoco/features/agent/engines.py +32 -6
  39. monoco/features/agent/manager.py +47 -6
  40. monoco/features/agent/models.py +2 -2
  41. monoco/features/agent/resources/atoms/atom-code-dev.yaml +61 -0
  42. monoco/features/agent/resources/atoms/atom-issue-lifecycle.yaml +73 -0
  43. monoco/features/agent/resources/atoms/atom-knowledge.yaml +55 -0
  44. monoco/features/agent/resources/atoms/atom-review.yaml +60 -0
  45. monoco/{core/resources/en → features/agent/resources/en/skills/monoco_atom_core}/SKILL.md +3 -1
  46. monoco/features/agent/resources/en/skills/monoco_workflow_agent_engineer/SKILL.md +94 -0
  47. monoco/features/agent/resources/en/skills/monoco_workflow_agent_manager/SKILL.md +93 -0
  48. monoco/features/agent/resources/en/skills/monoco_workflow_agent_planner/SKILL.md +85 -0
  49. monoco/features/agent/resources/en/skills/monoco_workflow_agent_reviewer/SKILL.md +114 -0
  50. monoco/features/agent/resources/workflows/workflow-dev.yaml +83 -0
  51. monoco/features/agent/resources/workflows/workflow-issue-create.yaml +72 -0
  52. monoco/features/agent/resources/workflows/workflow-review.yaml +94 -0
  53. monoco/features/agent/resources/zh/roles/monoco_role_engineer.yaml +49 -0
  54. monoco/features/agent/resources/zh/roles/monoco_role_manager.yaml +46 -0
  55. monoco/features/agent/resources/zh/roles/monoco_role_planner.yaml +46 -0
  56. monoco/features/agent/resources/zh/roles/monoco_role_reviewer.yaml +47 -0
  57. monoco/{core/resources/zh → features/agent/resources/zh/skills/monoco_atom_core}/SKILL.md +3 -1
  58. monoco/features/agent/resources/{skills/flow_engineer → zh/skills/monoco_workflow_agent_engineer}/SKILL.md +2 -2
  59. monoco/features/agent/resources/{skills/flow_manager → zh/skills/monoco_workflow_agent_manager}/SKILL.md +2 -2
  60. monoco/features/agent/resources/zh/skills/monoco_workflow_agent_planner/SKILL.md +259 -0
  61. monoco/features/agent/resources/zh/skills/monoco_workflow_agent_reviewer/SKILL.md +137 -0
  62. monoco/features/agent/session.py +59 -11
  63. monoco/features/agent/worker.py +38 -2
  64. monoco/features/artifact/__init__.py +0 -0
  65. monoco/features/artifact/adapter.py +33 -0
  66. monoco/features/artifact/resources/zh/AGENTS.md +14 -0
  67. monoco/features/artifact/resources/zh/skills/monoco_atom_artifact/SKILL.md +278 -0
  68. monoco/features/glossary/__init__.py +0 -0
  69. monoco/features/glossary/adapter.py +42 -0
  70. monoco/features/glossary/config.py +5 -0
  71. monoco/features/glossary/resources/en/AGENTS.md +29 -0
  72. monoco/features/glossary/resources/en/skills/monoco_atom_glossary/SKILL.md +35 -0
  73. monoco/features/glossary/resources/zh/AGENTS.md +29 -0
  74. monoco/features/glossary/resources/zh/skills/monoco_atom_glossary/SKILL.md +35 -0
  75. monoco/features/hooks/__init__.py +11 -0
  76. monoco/features/hooks/adapter.py +67 -0
  77. monoco/features/hooks/commands.py +309 -0
  78. monoco/features/hooks/core.py +441 -0
  79. monoco/features/hooks/resources/ADDING_HOOKS.md +234 -0
  80. monoco/features/i18n/adapter.py +18 -5
  81. monoco/features/i18n/core.py +482 -17
  82. monoco/features/i18n/resources/en/{SKILL.md → skills/monoco_atom_i18n/SKILL.md} +3 -1
  83. monoco/features/i18n/resources/en/skills/monoco_workflow_i18n_scan/SKILL.md +105 -0
  84. monoco/features/i18n/resources/zh/{SKILL.md → skills/monoco_atom_i18n/SKILL.md} +3 -1
  85. monoco/features/i18n/resources/{skills/i18n_scan_workflow → zh/skills/monoco_workflow_i18n_scan}/SKILL.md +2 -2
  86. monoco/features/issue/adapter.py +19 -6
  87. monoco/features/issue/commands.py +281 -7
  88. monoco/features/issue/core.py +272 -19
  89. monoco/features/issue/engine/machine.py +118 -5
  90. monoco/features/issue/linter.py +60 -5
  91. monoco/features/issue/models.py +3 -2
  92. monoco/features/issue/resources/en/AGENTS.md +109 -0
  93. monoco/features/issue/resources/en/{SKILL.md → skills/monoco_atom_issue/SKILL.md} +3 -1
  94. monoco/features/issue/resources/en/skills/monoco_workflow_issue_creation/SKILL.md +167 -0
  95. monoco/features/issue/resources/en/skills/monoco_workflow_issue_development/SKILL.md +224 -0
  96. monoco/features/issue/resources/en/skills/monoco_workflow_issue_management/SKILL.md +159 -0
  97. monoco/features/issue/resources/en/skills/monoco_workflow_issue_refinement/SKILL.md +203 -0
  98. monoco/features/issue/resources/hooks/post-checkout.sh +39 -0
  99. monoco/features/issue/resources/hooks/pre-commit.sh +41 -0
  100. monoco/features/issue/resources/hooks/pre-push.sh +35 -0
  101. monoco/features/issue/resources/zh/AGENTS.md +109 -0
  102. monoco/features/issue/resources/zh/{SKILL.md → skills/monoco_atom_issue_lifecycle/SKILL.md} +3 -1
  103. monoco/features/issue/resources/zh/skills/monoco_workflow_issue_creation/SKILL.md +167 -0
  104. monoco/features/issue/resources/zh/skills/monoco_workflow_issue_development/SKILL.md +224 -0
  105. monoco/features/issue/resources/{skills/issue_lifecycle_workflow → zh/skills/monoco_workflow_issue_management}/SKILL.md +2 -2
  106. monoco/features/issue/resources/zh/skills/monoco_workflow_issue_refinement/SKILL.md +203 -0
  107. monoco/features/issue/validator.py +101 -1
  108. monoco/features/memo/adapter.py +21 -8
  109. monoco/features/memo/cli.py +103 -10
  110. monoco/features/memo/core.py +178 -92
  111. monoco/features/memo/models.py +53 -0
  112. monoco/features/memo/resources/en/skills/monoco_atom_memo/SKILL.md +77 -0
  113. monoco/features/memo/resources/en/skills/monoco_workflow_note_processing/SKILL.md +140 -0
  114. monoco/features/memo/resources/zh/{SKILL.md → skills/monoco_atom_memo/SKILL.md} +3 -1
  115. monoco/features/memo/resources/{skills/note_processing_workflow → zh/skills/monoco_workflow_note_processing}/SKILL.md +2 -2
  116. monoco/features/spike/adapter.py +18 -5
  117. monoco/features/spike/resources/en/{SKILL.md → skills/monoco_atom_spike/SKILL.md} +3 -1
  118. monoco/features/spike/resources/en/skills/monoco_workflow_research/SKILL.md +121 -0
  119. monoco/features/spike/resources/zh/{SKILL.md → skills/monoco_atom_spike/SKILL.md} +3 -1
  120. monoco/features/spike/resources/{skills/research_workflow → zh/skills/monoco_workflow_research}/SKILL.md +2 -2
  121. monoco/main.py +38 -1
  122. monoco_toolkit-0.3.11.dist-info/METADATA +130 -0
  123. monoco_toolkit-0.3.11.dist-info/RECORD +181 -0
  124. monoco/features/agent/reliability.py +0 -106
  125. monoco/features/agent/resources/skills/flow_reviewer/SKILL.md +0 -114
  126. monoco_toolkit-0.3.9.dist-info/METADATA +0 -127
  127. monoco_toolkit-0.3.9.dist-info/RECORD +0 -115
  128. /monoco/{core → features/agent}/resources/en/AGENTS.md +0 -0
  129. /monoco/{core → features/agent}/resources/zh/AGENTS.md +0 -0
  130. {monoco_toolkit-0.3.9.dist-info → monoco_toolkit-0.3.11.dist-info}/WHEEL +0 -0
  131. {monoco_toolkit-0.3.9.dist-info → monoco_toolkit-0.3.11.dist-info}/entry_points.txt +0 -0
  132. {monoco_toolkit-0.3.9.dist-info → monoco_toolkit-0.3.11.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,8 @@
1
1
  import fnmatch
2
2
  from pathlib import Path
3
- from typing import List, Optional
3
+ from typing import List, Optional, Tuple
4
+ from enum import Enum
5
+ from dataclasses import dataclass
4
6
  import re
5
7
 
6
8
  DEFAULT_EXCLUDES = [
@@ -214,43 +216,506 @@ def check_translation_exists(
214
216
  return missing
215
217
 
216
218
 
219
# Common technical terms that should not count as "English words"
# when detecting language in Chinese documents.
# NOTE: this is a plain set literal; duplicated entries (e.g. "cli",
# "snyk", "pwa") collapse harmlessly at construction time.
TECHNICAL_TERMS_ALLOWLIST = {
    # CLI/Shell
    "cli", "api", "ui", "ux", "gui", "shell", "bash", "zsh", "sh",
    "cmd", "powershell", "terminal", "console", "prompt",
    # Cloud/Container
    "kubernetes", "k8s", "docker", "container", "pod", "cluster", "node",
    "namespace", "ingress", "service", "deployment", "helm", "kubectl",
    "aws", "gcp", "azure", "cloud", "serverless", "lambda", "ec2", "s3",
    # DevOps/CI/CD
    "ci", "cd", "cicd", "pipeline", "jenkins", "gitlab", "github", "git",
    "svn", "mercurial", "hg", "commit", "branch", "merge", "rebase", "tag",
    "hook", "action", "workflow", "artifact", "build", "deploy", "release",
    # Programming Languages
    "python", "javascript", "js", "typescript", "ts", "java", "kotlin",
    "scala", "groovy", "ruby", "go", "golang", "rust", "c", "cpp", "c++",
    "csharp", "c#", "php", "perl", "lua", "swift", "objc", "objective-c",
    "r", "matlab", "julia", "dart", "flutter", "elixir", "erlang", "haskell",
    "clojure", "lisp", "scheme", "racket", "fsharp", "f#", "vb", "vba",
    # Web/Frameworks
    "html", "css", "scss", "sass", "less", "xml", "json", "yaml", "yml",
    "toml", "ini", "csv", "tsv", "markdown", "md", "rst", "asciidoc",
    "react", "vue", "angular", "svelte", "nextjs", "nuxt", "django",
    "flask", "fastapi", "tornado", "express", "koa", "nestjs", "spring",
    "rails", "laravel", "symfony", "dotnet", "aspnet", "mvc", "mvvm",
    # Databases
    "sql", "nosql", "mysql", "postgresql", "postgres", "sqlite", "oracle",
    "mssql", "sqlserver", "mongodb", "mongo", "redis", "cassandra",
    "dynamodb", "firebase", "elasticsearch", "solr", "neo4j", "graphql",
    # Testing
    "test", "testing", "unittest", "pytest", "jest", "mocha", "jasmine",
    "cypress", "selenium", "cucumber", "bdd", "tdd", "mock", "stub",
    "fixture", "assertion", "coverage", "benchmark", "profiling",
    # Architecture/Patterns
    "microservice", "microservices", "monolith", "server", "client",
    "frontend", "backend", "fullstack", "api-gateway", "load-balancer",
    "proxy", "cache", "cdn", "dns", "http", "https", "tcp", "udp",
    "websocket", "grpc", "rest", "soap", "oauth", "jwt",
    "sso", "ldap", "auth", "authentication", "authorization",
    # OS/Platform
    "linux", "ubuntu", "debian", "centos", "rhel", "fedora", "arch",
    "alpine", "windows", "macos", "darwin", "ios", "android",
    "unix", "posix", "kernel", "systemd", "init", "daemon",
    # Tools/IDE
    "vscode", "idea", "pycharm", "webstorm", "vim", "neovim", "nvim",
    "emacs", "sublime", "atom", "eclipse", "netbeans", "xcode",
    "docker-compose", "dockerfile", "makefile", "cmake", "gradle",
    "maven", "npm", "yarn", "pnpm", "pip", "conda", "venv", "virtualenv",
    # AI/ML
    "ai", "ml", "dl", "llm", "nlp", "cv", "neural", "network",
    "tensorflow", "pytorch", "keras", "scikit", "sklearn", "pandas",
    "numpy", "scipy", "matplotlib", "seaborn", "jupyter", "notebook",
    "training", "inference", "model", "dataset", "vector", "embedding",
    # Security
    "security", "vulnerability", "exploit", "cve", "xss", "csrf",
    "injection", "encryption", "decryption", "hash", "signature",
    "certificate", "ssl", "tls", "firewall", "vpn",
    # Monitoring/Observability
    "log", "logging", "metrics", "tracing", "observability", "monitoring",
    "alert", "dashboard", "grafana", "prometheus", "elk", "splunk",
    "datadog", "newrelic", "sentry", "bugsnag", "rollbar",
    # Agile/Project Management
    "agile", "scrum", "kanban", "sprint", "backlog", "epic", "story",
    "task", "issue", "ticket", "bug", "feature", "milestone", "roadmap",
    "retro", "standup", "review", "demo", "po", "sm", "pm",
    # Misc Tech Terms
    "id", "uuid", "guid", "url", "uri", "ip", "ipv4", "ipv6",
    "mac", "hostname", "domain", "subdomain", "path", "query",
    "header", "body", "payload", "request", "response", "status",
    "error", "exception", "warning", "info", "debug", "trace",
    "config", "configuration", "setting", "option", "flag", "env",
    "variable", "constant", "literal", "expression", "statement",
    "function", "method", "class", "object", "instance", "interface",
    "abstract", "virtual", "override", "inherit", "extend", "implement",
    "import", "export", "module", "package", "library", "framework",
    "sdk", "toolkit", "runtime", "compiler", "interpreter", "vm",
    "version", "changelog", "license", "copyright",
    "repo", "repository", "fork", "clone", "pull", "push", "fetch",
    "upstream", "origin", "remote", "local", "stash", "stage",
    "index", "working", "tree", "head", "detached", "orphan",
    "squash", "amend", "cherry-pick", "revert", "reset", "clean",
    "linter", "formatter", "parser", "lexer", "ast", "ir",
    "bytecode", "opcode", "assembly", "binary", "executable",
    "static", "dynamic", "linking", "compilation", "transpilation",
    "minification", "bundling", "tree-shaking", "code-splitting",
    "hot-reload", "hot-restart", "live-reload", "watch", "watchman",
    "polyfill", "shim", "ponyfill", "fallback", "graceful",
    "async", "sync", "parallel", "concurrent", "sequential",
    "blocking", "non-blocking", "io", "nio", "epoll", "kqueue",
    "thread", "process", "coroutine", "fiber", "goroutine",
    "mutex", "lock", "semaphore", "channel", "queue", "stack",
    "heap", "gc", "garbage", "collection", "memory", "leak",
    "buffer", "stream", "pipe", "redirect", "tee", "cat",
    "grep", "awk", "sed", "cut", "sort", "uniq", "wc", "head", "tail",
    "find", "locate", "which", "whereis", "type", "alias",
    "source", "printenv", "set", "unset",
    "chmod", "chown", "chgrp", "umask", "sudo", "su",
    "ssh", "scp", "sftp", "rsync", "ftp", "telnet", "nc",
    "ping", "traceroute", "netstat", "ss", "lsof", "fuser",
    "ps", "top", "htop", "kill", "pkill", "killall", "nice",
    "cron", "at", "batch", "systemctl",
    "mount", "umount", "df", "du", "fsck", "mkfs", "fdisk",
    "parted", "lsblk", "blkid", "uuidgen", "tune2fs",
    "tar", "gzip", "gunzip", "zip", "unzip", "bz2", "xz",
    "7z", "rar", "archive", "compress", "decompress", "extract",
    "curl", "wget", "httpie", "postman", "insomnia",
    "nginx", "apache", "httpd", "tomcat", "jetty", "undertow",
    "haproxy", "traefik", "envoy", "istio", "linkerd",
    "rabbitmq", "kafka", "mqtt", "amqp", "stomp", "zeromq",
    "memcached", "etcd", "consul", "vault", "zookeeper",
    "jaeger", "zipkin", "opentelemetry",
    "ansible", "puppet", "chef", "saltstack", "terraform",
    "pulumi", "vagrant", "packer", "nomad", "consul-template",
    "github-actions", "gitlab-ci", "travis", "circleci",
    "teamcity", "bamboo", "drone", "argo", "tekton", "spinnaker",
    "sonarqube", "nexus", "artifactory", "harbor", "chartmuseum",
    "loki", "fluentd", "fluent-bit", "filebeat",
    "telegraf", "influxdb", "timescaledb", "promscale",
    "minio", "ceph", "glusterfs", "nfs", "smb", "cifs",
    "wireguard", "openvpn", "ipsec", "ssl-vpn",
    "waf", "ids", "ips", "siem", "soar", "xdr", "edr",
    "ad", "saml", "oauth2", "openid", "oidc",
    "mfa", "2fa", "totp", "hotp", "u2f", "webauthn", "fido",
    "aes", "rsa", "ecc", "dsa", "ecdsa", "ed25519", "curve25519",
    "sha", "md5", "bcrypt", "scrypt", "argon2", "pbkdf2",
    "hmac", "cmac", "gcm", "cbc", "ecb", "ctr", "ofb", "cfb",
    "x509", "csr", "crt", "pem", "der", "p12", "pfx",
    "acme", "letsencrypt", "certbot", "caddy",
    "wasm", "webassembly", "wasmer", "wasmtime", "wasi",
    "pwa", "spa", "mpa", "ssr", "ssg", "isr",
    "amp", "instant", "turbo", "stimulus", "htmx",
    "webcomponents", "shadow", "dom", "custom", "elements",
    "service-worker", "manifest", "offline",
    "webrtc", "sse", "eventsource", "polling",
    "subscription", "mutation", "schema",
    "resolver", "directive", "fragment", "union",
    "prisma", "sequelize", "typeorm", "sqlalchemy", "orm",
    "migration", "seed", "factory",
    "faker", "factory-boy", "hypothesis", "property-based",
    "snapshot", "visual", "regression", "e2e", "integration",
    "unit", "functional", "acceptance", "performance", "load",
    "stress", "chaos", "contract", "pact", "consumer", "provider",
    "atdd", "sbe", "example", "specification",
    "given", "when", "then", "scenario", "background",
    "behave", "specflow", "gauge", "relish",
    "allure", "reportportal", "xunit", "nunit", "mstest",
    "sonar", "coveralls", "codecov", "codeclimate", "codacy",
    "deepsource", "snyk", "whitesource", "blackduck", "fossa",
    "dependabot", "renovate", "greenkeeper",
    "pre-commit", "husky", "lint-staged", "commitlint",
    "semantic-release", "standard-version", "conventional",
    "commitizen", "cz",
    "monoco", "kimi", "claude", "gemini", "qwen", "gpt",
}


def detect_language(content: str) -> str:
    """
    Detect the language of the content using improved heuristics.

    This function is designed to handle technical documents with mixed
    Chinese and English content, especially for IT/Software development
    topics: fenced/inline code, URLs and issue IDs are stripped before
    counting, and English words from TECHNICAL_TERMS_ALLOWLIST do not
    count towards "English" classification.

    Returns: 'zh', 'en', or 'unknown'
    """
    if not content:
        return "unknown"

    # Strip YAML Frontmatter if present.
    frontmatter_pattern = re.compile(r"^---\n.*?\n---\n", re.DOTALL)
    content = frontmatter_pattern.sub("", content)

    if not content.strip():
        return "unknown"

    # Remove fenced code blocks (```...```) — they are full of English keywords.
    code_block_pattern = re.compile(r"```[\s\S]*?```", re.MULTILINE)
    content_no_code = code_block_pattern.sub("", content)

    # Remove inline code (`...`).
    inline_code_pattern = re.compile(r"`[^`]+`")
    content_no_code = inline_code_pattern.sub("", content_no_code)

    # Remove URLs.
    # FIX: dropped the trailing "|" that created an empty alternation —
    # it matched the empty string at every position and diverged from the
    # identical URL pattern used elsewhere in this module.
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    content_clean = url_pattern.sub("", content_no_code)

    # Remove issue IDs (EPIC-0001, FEAT-1234, etc.).
    issue_id_pattern = re.compile(r"\b(EPIC|FEAT|CHORE|FIX)-\d{4}\b")
    content_clean = issue_id_pattern.sub("", content_clean)

    if not content_clean.strip():
        # If after cleaning there's nothing left, it was likely all code/IDs.
        return "unknown"

    total_chars = len(content_clean)

    # Count CJK characters (Common CJK Unified Ideographs).
    cjk_count = sum(1 for c in content_clean if "\u4e00" <= c <= "\u9fff")

    # Count non-ASCII characters excluding CJK (accents, symbols, ...).
    non_ascii_non_cjk = sum(
        1 for c in content_clean
        if ord(c) > 127 and not ("\u4e00" <= c <= "\u9fff")
    )

    # Extract alphabetic words for English analysis.
    words = re.findall(r"\b[a-zA-Z][a-zA-Z0-9]*\b", content_clean)
    total_words = len(words)

    # Words that are known technical jargon (case-insensitive match).
    technical_term_count = sum(
        1 for word in words
        if word.lower() in TECHNICAL_TERMS_ALLOWLIST
    )
    non_technical_words = total_words - technical_term_count

    # Heuristic 1: > 3% CJK characters -> Chinese document.
    # (Threshold deliberately lower than the naive 5% to catch terse docs.)
    cjk_ratio = cjk_count / total_chars if total_chars > 0 else 0
    if cjk_ratio > 0.03:
        return "zh"

    # Heuristic 2: some CJK (>1%) mixed with technical English terms
    # -> technical Chinese document.
    if cjk_ratio > 0.01 and technical_term_count > 0:
        return "zh"

    # Heuristic 3: mostly-ASCII text with meaningful non-technical
    # English content (>= 10 plain words) -> English.
    non_ascii_ratio = non_ascii_non_cjk / total_chars if total_chars > 0 else 0
    if non_ascii_ratio < 0.15 and non_technical_words >= 10:
        return "en"

    # Heuristic 4: high word density with essentially no CJK -> English.
    if cjk_ratio < 0.01 and total_words > 20:
        return "en"

    return "unknown"
class BlockType(Enum):
    """Kinds of structural blocks recognized in a Markdown document."""

    HEADING = "heading"        # ATX heading line (# ...)
    PARAGRAPH = "paragraph"    # plain narrative text
    CODE_BLOCK = "code_block"  # fenced ``` ... ``` region
    LIST_ITEM = "list_item"    # bullet or ordered list item
    QUOTE = "quote"            # > blockquote line
    TABLE = "table"            # pipe-delimited table row(s)
    EMPTY = "empty"            # blank separator line


@dataclass
class ContentBlock:
    """A contiguous chunk of Markdown plus its language-detection state."""

    type: BlockType  # structural kind of this chunk
    content: str     # raw text of the chunk
    line_start: int  # first line index of the chunk
    line_end: int    # end line index (exclusive)
    detected_lang: str = "unknown"  # "zh" / "en" / "unknown"
    should_skip: bool = False       # True when exempt from language checks
def parse_markdown_blocks(content: str) -> List[ContentBlock]:
    """
    Parse markdown content into blocks for language detection.

    This function respects block boundaries like:
    - Code blocks (```...```)
    - Headings (# ...)
    - Paragraphs
    - List items

    Line indices in the returned blocks are 0-based and relative to the
    content after YAML frontmatter has been stripped.

    Returns a list of ContentBlock objects.
    """
    # Strip YAML Frontmatter if present.
    frontmatter_pattern = re.compile(r"^---\n.*?\n---\n", re.DOTALL)
    content_without_fm = frontmatter_pattern.sub("", content)

    lines = content_without_fm.splitlines()
    blocks: List[ContentBlock] = []
    current_block_lines: List[str] = []
    current_block_type = BlockType.PARAGRAPH
    current_start_line = 0
    in_code_block = False
    # FIX: removed the unused `code_block_lang` local (assigned, never read).

    def flush_block():
        # Emit the accumulated lines (if any) as one block, then reset the
        # accumulator. Reads current_block_type / current_start_line from
        # the enclosing scope at call time.
        # FIX: renamed the closure-local so it no longer shadows the
        # `content` parameter, and dropped the never-assigned nonlocal
        # declaration of current_start_line.
        nonlocal current_block_lines
        if current_block_lines:
            block_text = "\n".join(current_block_lines)
            blocks.append(
                ContentBlock(
                    type=current_block_type,
                    content=block_text,
                    line_start=current_start_line,
                    line_end=current_start_line + len(current_block_lines),
                )
            )
            current_block_lines = []

    for i, line in enumerate(lines):
        # Fenced code-block delimiters toggle code mode; the fence lines
        # themselves belong to the code block.
        if line.strip().startswith("```"):
            if not in_code_block:
                # Start of code block.
                flush_block()
                in_code_block = True
                current_block_type = BlockType.CODE_BLOCK
                current_start_line = i
                current_block_lines.append(line)
            else:
                # End of code block.
                current_block_lines.append(line)
                flush_block()
                in_code_block = False
                current_block_type = BlockType.PARAGRAPH
            continue

        if in_code_block:
            current_block_lines.append(line)
            continue

        # ATX heading: always a single-line block of its own.
        if re.match(r"^#{1,6}\s", line):
            flush_block()
            blocks.append(
                ContentBlock(
                    type=BlockType.HEADING,
                    content=line,
                    line_start=i,
                    line_end=i + 1,
                )
            )
            current_start_line = i + 1
            current_block_type = BlockType.PARAGRAPH
            continue

        # Blank line: terminates the current block, recorded as EMPTY.
        if not line.strip():
            flush_block()
            blocks.append(ContentBlock(
                type=BlockType.EMPTY,
                content="",
                line_start=i,
                line_end=i + 1,
            ))
            current_start_line = i + 1
            current_block_type = BlockType.PARAGRAPH
            continue

        # List item (bullet or ordered): each item starts a new block.
        if re.match(r"^\s*[-*+]\s", line) or re.match(r"^\s*\d+\.\s", line):
            flush_block()
            current_block_type = BlockType.LIST_ITEM
            current_start_line = i
            current_block_lines.append(line)
            continue

        # Blockquote line.
        if line.strip().startswith(">"):
            flush_block()
            current_block_type = BlockType.QUOTE
            current_start_line = i
            current_block_lines.append(line)
            continue

        # Table-row heuristic: any non-heading line containing a pipe.
        # NOTE(review): this also matches prose containing "|" — presumably
        # acceptable for the lint use case; confirm.
        if "|" in line and not line.strip().startswith("#"):
            if current_block_type != BlockType.TABLE:
                flush_block()
                current_block_type = BlockType.TABLE
                current_start_line = i
            current_block_lines.append(line)
            continue

        # Default: accumulate into the current (paragraph-like) block.
        if not current_block_lines:
            current_start_line = i
        current_block_lines.append(line)

    # Flush whatever is still accumulated at EOF.
    flush_block()

    return blocks
def should_skip_block_for_language_check(
    block: ContentBlock,
    all_blocks: List[ContentBlock],
    block_index: int,
    source_lang: str = "zh"
) -> bool:
    """
    Determine if a block should be skipped during language consistency checks.

    Design Principle:
    - Narrative text should be in the source language (e.g., Chinese)
    - English should only appear as isolated nouns (technical terms, filenames, code blocks)

    A block is skipped when it is a code block, is empty, or contains
    nothing but technical fragments (inline code, URLs, issue IDs,
    file names) once those are stripped away.

    Note: ``all_blocks``, ``block_index`` and ``source_lang`` are accepted
    for context-aware rules but are not consulted by the current heuristics.
    """
    # Code blocks always contain programming-language keywords; empty
    # blocks carry no language signal.
    if block.type in (BlockType.CODE_BLOCK, BlockType.EMPTY):
        return True

    text = block.content.strip()
    if not text:
        return True

    # Strip non-language elements in order; whatever survives is
    # narrative text that must be language-checked.
    for pattern in (
        r"`[^`]+`",                          # inline code
        r"https?://\S+|www\.\S+",            # URLs
        r"\b(EPIC|FEAT|CHORE|FIX)-\d{4}\b",  # issue IDs
        r"[\w\-]+\.[\w\-]+",                 # file names / dotted paths
    ):
        text = re.sub(pattern, "", text)

    # Skip when nothing but technical fragments remained.
    return not text.strip()
def detect_language_blocks(content: str, source_lang: str = "zh") -> List[ContentBlock]:
    """
    Detect language for each block in the content.

    This provides block-level language detection that respects:
    - Code blocks (skipped)
    - Technical terms (handled by detect_language)
    - Paragraph boundaries

    Design Principle:
    - Narrative text should be in the source language
    - English should only appear as isolated nouns

    Returns a list of ContentBlock objects with detected language;
    skipped blocks are marked with detected_lang == "unknown".
    """
    blocks = parse_markdown_blocks(content)

    for idx, blk in enumerate(blocks):
        # Mark exempt blocks first, then run detection only on the rest.
        blk.should_skip = should_skip_block_for_language_check(
            blk, blocks, idx, source_lang
        )
        blk.detected_lang = (
            "unknown" if blk.should_skip else detect_language(blk.content)
        )

    return blocks
def has_language_mismatch_blocks(content: str, source_lang: str = "zh") -> Tuple[bool, List[ContentBlock]]:
    """
    Check if content has language mismatches at block level.

    Returns:
        (has_mismatch, mismatched_blocks)
        - has_mismatch: True if any non-skipped block has mismatched language
        - mismatched_blocks: List of blocks that don't match source language
    """
    blocks = detect_language_blocks(content, source_lang)

    # The only "wrong" language for a zh/cn source is English, and vice
    # versa; any other source language yields no mismatches.
    lang = source_lang.lower()
    if lang in ("zh", "cn"):
        foreign = "en"
    elif lang == "en":
        foreign = "zh"
    else:
        foreign = None

    mismatched = [
        blk for blk in blocks
        if not blk.should_skip
        and blk.detected_lang != "unknown"
        and blk.detected_lang == foreign
    ]

    return len(mismatched) > 0, mismatched
254
719
  def is_content_source_language(path: Path, source_lang: str = "en") -> bool:
255
720
  """
256
721
  Check if file content appears to be in the source language.
@@ -1,6 +1,8 @@
1
1
  ---
2
- name: monoco-i18n
2
+ name: monoco_atom_i18n
3
3
  description: Internationalization quality control for documentation. Ensures multi-language documentation stays synchronized.
4
+ type: atom
5
+ version: 1.0.0
4
6
  ---
5
7
 
6
8
  # Documentation I18n
@@ -0,0 +1,105 @@
1
+ ---
2
+ name: monoco_workflow_i18n_scan
3
+ description: I18n Scan Workflow (Flow Skill). Defines the standard operational process from scanning missing translations to generating translation tasks, ensuring multilingual documentation quality.
4
+ type: workflow
5
+ domain: i18n
6
+ version: 1.0.0
7
+ ---
8
+
9
+ # I18n Scan Workflow
10
+
11
+ Standardized workflow for I18n scanning, ensuring the "Scan → Identify → Generate Tasks" process.
12
+
13
+ ## Workflow State Machine
14
+
15
+ ```mermaid
16
+ stateDiagram-v2
17
+ [*] --> Scan: Trigger scan
18
+
19
+ Scan --> Identify: Scan completed
20
+ Scan --> Scan: Configuration error<br/>(fix configuration)
21
+
22
+ Identify --> GenerateTasks: Missing found
23
+ Identify --> [*]: No missing<br/>(completed)
24
+
25
+ GenerateTasks --> [*]: Task generation completed
26
+ ```
27
+
28
+ ## Execution Steps
29
+
30
+ ### 1. Scan (Scanning)
31
+
32
+ - **Goal**: Scan all documents in the project, identify translation coverage
33
+ - **Input**: Project files, i18n configuration
34
+ - **Output**: Scan report
35
+ - **Checkpoints**:
36
+ - [ ] Check i18n configuration in `.monoco/config.yaml`
37
+ - [ ] Run `monoco i18n scan`
38
+ - [ ] Confirm source and target language settings are correct
39
+ - [ ] Verify exclusion rules (.gitignore, build directories, etc.)
40
+
41
+ ### 2. Identify (Identify Missing)
42
+
43
+ - **Goal**: Analyze scan results, identify specific missing translations
44
+ - **Strategy**: Compare source and target files
45
+ - **Checkpoints**:
46
+ - [ ] List all source files with missing translations
47
+ - [ ] Identify missing target languages
48
+ - [ ] Assess impact scope of missing translations
49
+ - [ ] Sort by priority (core documents first)
50
+
51
+ ### 3. Generate Tasks (Generate Tasks)
52
+
53
+ - **Goal**: Create tracking tasks for missing translations
54
+ - **Strategy**: Create Issue or memo based on missing status
55
+ - **Checkpoints**:
56
+ - [ ] Create Feature Issue for core document missing translations
57
+ - [ ] Create Memo reminder for secondary document missing translations
58
+ - [ ] Annotate file paths requiring translation in the Issue
59
+ - [ ] Set reasonable priority and deadline
60
+
61
+ ## Decision Branches
62
+
63
+ | Condition | Action |
64
+ |-----------|--------|
65
+ | Configuration error | Fix `.monoco/config.yaml`, rescan |
66
+ | No missing translations | Process completed, no further action needed |
67
+ | Large amount missing | Create Epic, split into multiple Features |
68
+ | Critical document missing | High priority, create Issue immediately |
69
+
70
+ ## Compliance Requirements
71
+
72
+ - **Required**: Verify i18n configuration is correct before scanning
73
+ - **Required**: All core documents must have corresponding translations
74
+ - **Recommended**: Run scans regularly (e.g., weekly)
75
+ - **Recommended**: Bind translation tasks with feature development
76
+
77
+ ## Related Commands
78
+
79
+ ```bash
80
+ # Scan for missing translations
81
+ monoco i18n scan
82
+
83
+ # Create translation task
84
+ monoco issue create feature -t "Translate {filepath} to {lang}"
85
+
86
+ # Add memo
87
+ monoco memo add "Needs translation: {filepath}"
88
+ ```
89
+
90
+ ## Output Example
91
+
92
+ After scanning completes, a report like the following should be generated:
93
+
94
+ ```
95
+ I18n Scan Report
96
+ ================
97
+ Source Language: en
98
+ Target Languages: zh, ja
99
+
100
+ Missing Translations:
101
+ - docs/guide.md → zh/guide.md [MISSING]
102
+ - docs/api.md → ja/api.md [MISSING]
103
+
104
+ Coverage: 85%
105
+ ```
@@ -1,6 +1,8 @@
1
1
  ---
2
- name: monoco-i18n
2
+ name: monoco_atom_i18n
3
3
  description: 文档国际化质量控制。确保多语言文档保持同步。
4
+ type: atom
5
+ version: 1.0.0
4
6
  ---
5
7
 
6
8
  # 文档国际化
@@ -1,7 +1,7 @@
1
1
  ---
2
- name: i18n-scan-workflow
2
+ name: monoco_workflow_i18n_scan
3
3
  description: I18n 扫描工作流 (Flow Skill)。定义从扫描缺失翻译到生成翻译任务的标准操作流程,确保多语言文档质量。
4
- type: flow
4
+ type: workflow
5
5
  domain: i18n
6
6
  version: 1.0.0
7
7
  ---