factorio-ai-tools 1.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. factorio_ai_tools-1.1.1/.agents/AGENTS.md +11 -0
  2. factorio_ai_tools-1.1.1/.gitattributes +32 -0
  3. factorio_ai_tools-1.1.1/.github/release-drafter.yml +28 -0
  4. factorio_ai_tools-1.1.1/.github/workflows/docker-publish.yml +46 -0
  5. factorio_ai_tools-1.1.1/.github/workflows/publish-databases.yml +27 -0
  6. factorio_ai_tools-1.1.1/.github/workflows/pypi-publish.yml +27 -0
  7. factorio_ai_tools-1.1.1/.github/workflows/release-drafter.yml +22 -0
  8. factorio_ai_tools-1.1.1/.gitignore +15 -0
  9. factorio_ai_tools-1.1.1/CLAUDE.md +74 -0
  10. factorio_ai_tools-1.1.1/Dockerfile +19 -0
  11. factorio_ai_tools-1.1.1/LICENSE +21 -0
  12. factorio_ai_tools-1.1.1/Makefile +19 -0
  13. factorio_ai_tools-1.1.1/PKG-INFO +137 -0
  14. factorio_ai_tools-1.1.1/README.md +118 -0
  15. factorio_ai_tools-1.1.1/docs/assets/factorio-ai-tools.png +0 -0
  16. factorio_ai_tools-1.1.1/maintenance/compact_lancedb.py +168 -0
  17. factorio_ai_tools-1.1.1/maintenance/hooks/pre-push +42 -0
  18. factorio_ai_tools-1.1.1/make.bat +16 -0
  19. factorio_ai_tools-1.1.1/pyproject.toml +32 -0
  20. factorio_ai_tools-1.1.1/requirements.txt +76 -0
  21. factorio_ai_tools-1.1.1/setup.cfg +4 -0
  22. factorio_ai_tools-1.1.1/src/factorio_ai_tools/__init__.py +1 -0
  23. factorio_ai_tools-1.1.1/src/factorio_ai_tools/ingest/__init__.py +0 -0
  24. factorio_ai_tools-1.1.1/src/factorio_ai_tools/ingest/compact_db.py +27 -0
  25. factorio_ai_tools-1.1.1/src/factorio_ai_tools/ingest/ingest_clusterio.py +0 -0
  26. factorio_ai_tools-1.1.1/src/factorio_ai_tools/ingest/ingest_factorio.py +0 -0
  27. factorio_ai_tools-1.1.1/src/factorio_ai_tools/ingest/ingest_forum.py +0 -0
  28. factorio_ai_tools-1.1.1/src/factorio_ai_tools/ingest/ingest_github_repo.py +263 -0
  29. factorio_ai_tools-1.1.1/src/factorio_ai_tools/ingest/ingest_wiki.py +0 -0
  30. factorio_ai_tools-1.1.1/src/factorio_ai_tools/server.py +544 -0
  31. factorio_ai_tools-1.1.1/src/factorio_ai_tools.egg-info/PKG-INFO +137 -0
  32. factorio_ai_tools-1.1.1/src/factorio_ai_tools.egg-info/SOURCES.txt +36 -0
  33. factorio_ai_tools-1.1.1/src/factorio_ai_tools.egg-info/dependency_links.txt +1 -0
  34. factorio_ai_tools-1.1.1/src/factorio_ai_tools.egg-info/entry_points.txt +2 -0
  35. factorio_ai_tools-1.1.1/src/factorio_ai_tools.egg-info/requires.txt +9 -0
  36. factorio_ai_tools-1.1.1/src/factorio_ai_tools.egg-info/top_level.txt +1 -0
  37. factorio_ai_tools-1.1.1/start_mcp_server.bat +4 -0
  38. factorio_ai_tools-1.1.1/uv.lock +2176 -0
@@ -0,0 +1,11 @@
1
+ ## Python Windows Terminal Printing
2
+ When writing Python scripts that scrape the web or process LLM responses, NEVER directly `print()` raw dynamic strings. Windows PowerShell default encodings will throw a fatal `UnicodeEncodeError` when encountering characters like en-dashes or emojis.
3
+ *Fix:* Always encode/decode before printing:
4
+ ```python
5
+ print(text.encode('ascii', 'replace').decode('ascii'))
6
+ ```
7
+
8
+ ## Comprehensive Git Commits
9
+ When tasked with committing and pushing code to a repository, NEVER assume you have tracked all necessary files based on memory.
10
+ - Before confirming to the user that "all changes have been pushed", you MUST run `git status` to explicitly verify that no unexpectedly modified or untracked files were left behind in the working directory.
11
+ - Ensure that you either use `git add .` (if safe) or meticulously stage every modified file related to the current task before committing.
@@ -0,0 +1,32 @@
1
+ # Make line endings deterministic across machines, editors, and CI, so the
2
+ # repository stops drifting between CRLF and LF (and Git stops warning that
3
+ # "LF will be replaced by CRLF"). This file is authoritative and overrides
4
+ # each developer's personal `core.autocrlf` setting for this project.
5
+
6
+ # Default: let Git auto-detect text vs binary. Text is stored as LF in the repo.
7
+ * text=auto
8
+
9
+ # Source and config: enforce LF in the working tree too (not just in the repo),
10
+ # so the checked-out files always match what is committed. The repo is already
11
+ # all-LF, so this introduces no content churn.
12
+ *.py text eol=lf
13
+ *.md text eol=lf
14
+ *.txt text eol=lf
15
+ *.yaml text eol=lf
16
+ *.yml text eol=lf
17
+ *.cfg text eol=lf
18
+ *.toml text eol=lf
19
+ *.json text eol=lf
20
+ *.sh text eol=lf
21
+ .gitignore text eol=lf
22
+ .gitattributes text eol=lf
23
+
24
+ # Git hooks are run by sh and must keep LF endings (no file extension, so the
25
+ # rules above don't catch them); CRLF would break them on Windows checkout.
26
+ maintenance/hooks/* text eol=lf
27
+
28
+ # Pre-built LanceDB vector stores are binary data — never apply line-ending
29
+ # (or any) conversion. These rules come last so they win over the patterns
30
+ # above for any text-looking files (hint .json, version.txt) inside the stores.
31
+ data/*_lancedb/** binary
32
+ *.lance binary
@@ -0,0 +1,28 @@
1
+ name-template: 'v$NEXT_PATCH_VERSION'
2
+ tag-template: 'v$NEXT_PATCH_VERSION'
3
+ categories:
4
+ - title: '🚀 Features'
5
+ labels:
6
+ - 'feature'
7
+ - 'enhancement'
8
+ - title: '🐛 Bug Fixes'
9
+ labels:
10
+ - 'fix'
11
+ - 'bug'
12
+ - title: '📊 Dataset Updates'
13
+ labels:
14
+ - 'dataset'
15
+ - 'data'
16
+ - title: '🧰 Maintenance'
17
+ labels:
18
+ - 'chore'
19
+ - 'refactor'
20
+ - 'dependencies'
21
+ change-template: '- $TITLE @$AUTHOR (#$NUMBER)'
22
+ template: |
23
+ ## Changes
24
+
25
+ $CHANGES
26
+
27
+ ---
28
+ *Automated release generated by Release Drafter.*
@@ -0,0 +1,46 @@
1
+ name: Build and Publish Docker (GHCR)
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+ workflow_dispatch:
7
+
8
+ env:
9
+ REGISTRY: ghcr.io
10
+ IMAGE_NAME: ${{ github.repository }}
11
+
12
+ jobs:
13
+ build-and-push-image:
14
+ runs-on: ubuntu-latest
15
+ permissions:
16
+ contents: read
17
+ packages: write
18
+
19
+ steps:
20
+ - name: Checkout repository
21
+ uses: actions/checkout@v4
22
+
23
+ - name: Log in to the Container registry
24
+ uses: docker/login-action@v3
25
+ with:
26
+ registry: ${{ env.REGISTRY }}
27
+ username: ${{ github.actor }}
28
+ password: ${{ secrets.GITHUB_TOKEN }}
29
+
30
+ - name: Extract metadata (tags, labels) for Docker
31
+ id: meta
32
+ uses: docker/metadata-action@v5
33
+ with:
34
+ images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
35
+ tags: |
36
+ type=semver,pattern={{version}}
37
+ type=semver,pattern={{major}}.{{minor}}
38
+ type=raw,value=latest,enable={{is_default_branch}}
39
+
40
+ - name: Build and push Docker image
41
+ uses: docker/build-push-action@v5
42
+ with:
43
+ context: .
44
+ push: true
45
+ tags: ${{ steps.meta.outputs.tags }}
46
+ labels: ${{ steps.meta.outputs.labels }}
@@ -0,0 +1,27 @@
1
+ name: Publish Databases to GitHub Release
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+ workflow_dispatch:
7
+
8
+ jobs:
9
+ publish-databases:
10
+ runs-on: ubuntu-latest
11
+ permissions:
12
+ contents: write
13
+
14
+ steps:
15
+ - name: Checkout repository
16
+ uses: actions/checkout@v4
17
+
18
+ - name: Zip LanceDB folders
19
+ run: |
20
+ cd data
21
+ zip -r ../factorio_lancedb.zip factorio_lancedb/ clusterio_lancedb/ wiki_lancedb/ mod_lancedb/ forum_lancedb/
22
+
23
+ - name: Upload databases zip to Release
24
+ uses: softprops/action-gh-release@v2
25
+ if: github.event_name == 'release'
26
+ with:
27
+ files: factorio_lancedb.zip
@@ -0,0 +1,27 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+ workflow_dispatch:
7
+
8
+ jobs:
9
+ publish-pypi:
10
+ runs-on: ubuntu-latest
11
+ permissions:
12
+ id-token: write # IMPORTANT: mandatory for trusted publishing
13
+ contents: read
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ with:
17
+ fetch-depth: 0
18
+ - name: Set up Python
19
+ uses: actions/setup-python@v5
20
+ with:
21
+ python-version: "3.11"
22
+ - name: Install build tool
23
+ run: pip install build
24
+ - name: Build package
25
+ run: python -m build
26
+ - name: Publish to PyPI
27
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,22 @@
1
+ name: Release Drafter
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ # pull_request event is required only for autolabeler
8
+ pull_request:
9
+ types: [opened, reopened, synchronize]
10
+
11
+ permissions:
12
+ contents: write
13
+
14
+ jobs:
15
+ update_release_draft:
16
+ runs-on: ubuntu-latest
17
+ steps:
18
+ - uses: release-drafter/release-drafter@v6
19
+ with:
20
+ config-name: release-drafter.yml
21
+ env:
22
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -0,0 +1,15 @@
1
+ venv/
2
+ .mod_temp/
3
+ __pycache__/
4
+ .claude/
5
+ *.pyc
6
+ .env
7
+
8
+ # Build Artifacts
9
+ build/
10
+ dist/
11
+ *.egg-info/
12
+
13
+ # Local Data
14
+ data/
15
+ .venv/
@@ -0,0 +1,74 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Overview
6
+
7
+ A hybrid-search RAG system and FastMCP server that gives LLMs expert knowledge of Factorio modding and Clusterio plugin development. Five ingestion scripts scrape/parse external sources into local LanceDB vector stores; `server.py` exposes those stores to MCP clients (Claude Desktop, etc.) as search tools.
8
+
9
+ ## Commands
10
+
11
+ ```powershell
12
+ # One-time setup
13
+ python -m venv venv
14
+ .\venv\Scripts\Activate.ps1
15
+ pip install -r requirements.txt
16
+
17
+ # Build/refresh the vector databases (each is idempotent — safe to re-run).
18
+ # Scripts live in ingest/; every store is written under data/.
19
+ python ingest/ingest_factorio.py # -> data/factorio_lancedb (Lua API + prototype docs, multiple versions)
20
+ python ingest/ingest_clusterio.py # -> data/clusterio_lancedb (set CLUSTERIO_REPO; defaults to ./clusterio)
21
+ python ingest/ingest_wiki.py # -> data/wiki_lancedb (full Factorio wiki via MediaWiki API)
22
+ python ingest/ingest_forum.py # -> data/forum_lancedb (curated topics from forum_links.txt)
23
+ python ingest/ingest_github_mod.py --repo-url https://github.com/notnotmelon/maraxsis # -> data/mod_lancedb
24
+
25
+ # Compact/prune all data/*_lancedb stores (collapses LanceDB version history).
26
+ # Run before merging data changes to main; --check is a read-only guard.
27
+ python maintenance/compact_lancedb.py
28
+ python maintenance/compact_lancedb.py --check
29
+
30
+ # Run the MCP server (stdio transport)
31
+ python server.py
32
+ ```
33
+
34
+ There is no test suite, linter, or build step beyond the above. `smithery.yaml` defines the Smithery deployment (build = pip install + the three core ingest scripts; run = `python server.py`).
35
+
36
+ To test a tool manually, import `server.py` in a REPL — the `@mcp.tool()` functions are plain callables. Note that importing `server.py` eagerly loads the SentenceTransformer model and opens all five LanceDB connections.
37
+
38
+ ## Architecture
39
+
40
+ **Ingestion → LanceDB → MCP server.** Every database is built offline by an ingest script and queried at runtime by the server. The two halves only share the embedding model and the on-disk store; they never call each other.
41
+
42
+ **Shared embedding contract (must stay consistent across all scripts and the server):**
43
+ - Model: `BAAI/bge-base-en-v1.5`, overridable via `EMBEDDING_MODEL` env var. Device auto-selects CUDA→CPU.
44
+ - Vectors are **768-dim** and **L2-normalized** (`normalize_embeddings=True`). Any new ingest script or schema must match this dimension and normalization, or search breaks silently.
45
+
46
+ **Incremental / idempotent ingestion.** Every script hashes source content with SHA-256 (`get_hash`/`hashlib.sha256`) and stores it as `content_hash`. On re-run it compares hashes per source unit (per URL, per file, per wiki page, per forum topic) and **skips unchanged content; deletes-then-re-adds changed content.** This is why re-running an ingest script is cheap and safe.
47
+
48
+ **Schema migration guard.** Each ingest script checks whether the existing table has the current columns (e.g. `if "content_hash" not in table.schema.names`) and **drops + recreates the whole table** if the schema is stale. Changing a `LanceModel`/pyarrow schema therefore forces a full re-ingest of that store — account for that when editing schemas.
49
+
50
+ **Code-aware chunking via Tree-sitter.** `ingest_clusterio.py` (TypeScript) and `ingest_github_mod.py` (Lua) parse files into AST nodes (classes, functions, methods, interfaces/tables) using a Tree-sitter `Query`, and store each node as a chunk with `node_type`/`node_name`. Non-code files fall back to fixed-size sliding-window text chunking (`extract_text_chunks`, 1500 chars / 200 overlap). The doc/wiki/forum scripts use plain text chunking only.
51
+
52
+ **Per-store table names and key columns** (the server opens these by exact name; all stores live under `data/`):
53
+ - `data/factorio_lancedb` → table `docs`: `text`, `class_name`, `version`, `url`, `node_type`. Holds **multiple Factorio versions** (`["1.1.110", "latest"]`); search filters by `version`. Writes `version.txt`.
54
+ - `data/clusterio_lancedb` → table `codebase`: `content`, `file_path`, `node_type`, `node_name`. Writes `version.txt` from the repo's `package.json`.
55
+ - `data/wiki_lancedb` → table `docs`: `text`, `title`, `url`.
56
+ - `data/forum_lancedb` → table `forum`: `content`, `class_name` (= topic title), `file_path` (= URL), `version`.
57
+ - `data/mod_lancedb` → table `codebase`: `content`, `repo_url`, `file_path`, `node_type`, `node_name`. One store holds **multiple mods**; search filters by `mod_name` via `repo_url LIKE`.
58
+
59
+ **Server resilience.** `server.py` opens each table in its own try/except and sets the handle to `None` on failure, so a missing database degrades only the affected tool (which returns a "run ingest_X.py first" error) rather than crashing the server. Search tools accept a **list of queries** (batched encode) and clamp `limit` to 1–20.
60
+
61
+ **Non-search tools** in `server.py` are self-contained (no DB): `decode_factorio_blueprint`/`encode_factorio_blueprint` (base64+zlib, version byte `0`, 10 MB decompress guard), `factorio_mod_portal_analyzer` (mods.factorio.com API), `factorio_log_inspector` (OS-aware path to `factorio-current.log`), `get_mcp_version_info` (reads the `version.txt` files).
62
+
63
+ ## Conventions (from `.agents/AGENTS.md`)
64
+
65
+ - **Windows console printing:** never `print()` raw dynamic/scraped strings — PowerShell's default encoding throws `UnicodeEncodeError` on en-dashes/emojis. Wrap dynamic output: `print(text.encode('ascii', 'replace').decode('ascii'))`. The ingest scripts already do this; preserve it.
66
+ - **Committing:** always run `git status` to verify nothing modified/untracked is left behind before telling the user changes are pushed. Stage deliberately.
67
+ - **SQL-string safety:** LanceDB `.where()` clauses are built with f-strings, so user/dynamic values are escaped by doubling single quotes (`value.replace("'", "''")`). Keep this when adding filters.
68
+
69
+ ## Data files & git
70
+
71
+ - The `data/*_lancedb/` stores are committed to the repo and marked `binary` in `.gitattributes` (no line-ending conversion). `.mod_temp/` (clone scratch for `ingest_github_mod.py`, at repo root), `venv/`, `__pycache__/`, `.claude/`, and `.env` are gitignored.
72
+ - `.gitattributes` enforces LF line endings for all text/source files regardless of `core.autocrlf`.
73
+ - `forum_links.txt` is the curated input list for `ingest_forum.py` (one URL per line, `#` comments allowed).
74
+ - **LanceDB hygiene.** Stores are append-only and never self-prune, so each ingest run accumulates versions/fragments. Let that history grow on feature branches (a PR diff then shows what data changed), but run `python maintenance/compact_lancedb.py` before merging to `main` to collapse it (`Table.optimize()` with `cleanup_older_than=0`). `maintenance/hooks/pre-push` is an opt-in guard (`git config core.hooksPath maintenance/hooks`) that blocks pushes to `main` while any store is uncompacted, via `compact_lancedb.py --check`.
@@ -0,0 +1,19 @@
1
+ FROM python:3.11-slim
2
+
3
+ # Install git for any tree-sitter or fetching dependencies
4
+ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
5
+
6
+ WORKDIR /app
7
+
8
+ # Copy requirements first to leverage Docker cache
9
+ COPY requirements.txt .
10
+ RUN pip install --no-cache-dir -r requirements.txt
11
+
12
+ # Copy all python scripts and LanceDB vector databases
13
+ COPY . .
14
+
15
+ # Ensure the mcp server binds to stdio properly and PYTHONPATH is set for the src layout
16
+ ENV PYTHONUNBUFFERED=1
17
+ ENV PYTHONPATH=/app/src
18
+
19
+ ENTRYPOINT ["python", "-m", "factorio_ai_tools.server"]
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 solarcloud7
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,19 @@
1
+ .PHONY: help compact ingest-all mcp
2
+
3
+ help:
4
+ @echo "Available commands:"
5
+ @echo " make compact - Compact and condense the LanceDB database"
6
+ @echo " make ingest-all - Run the ingestion script for all configured repositories"
7
+ @echo " make mcp - Start the MCP server"
8
+
9
+ compact:
10
+ uv run --no-sync python src/factorio_ai_tools/ingest/compact_db.py
11
+
12
+ ingest-all:
13
+ uv run --no-sync python src/factorio_ai_tools/ingest/ingest_github_repo.py --repo-url https://github.com/clusterio/clusterio-docker.git
14
+ uv run --no-sync python src/factorio_ai_tools/ingest/ingest_github_repo.py --repo-url https://github.com/wube/factorio-data.git
15
+ uv run --no-sync python src/factorio_ai_tools/ingest/ingest_github_repo.py --repo-url https://github.com/redruin1/factorio-draftsman.git
16
+ uv run --no-sync python src/factorio_ai_tools/ingest/ingest_github_repo.py --repo-url https://github.com/Teoxoy/factorio-blueprint-editor.git
17
+
18
+ mcp:
19
+ .\start_mcp_server.bat
@@ -0,0 +1,137 @@
1
+ Metadata-Version: 2.4
2
+ Name: factorio-ai-tools
3
+ Version: 1.1.1
4
+ Summary: A lightning-fast, hybrid-search Vector Database and Model Context Protocol (MCP) server for Factorio modding.
5
+ Author: solarcloud7
6
+ Requires-Python: >=3.11
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: mcp[cli]>=1.1.2
10
+ Requires-Dist: lancedb>=0.17.0
11
+ Requires-Dist: beautifulsoup4>=4.12.0
12
+ Requires-Dist: pyarrow>=15.0.0
13
+ Requires-Dist: requests>=2.31.0
14
+ Requires-Dist: hf-transfer>=0.1.0
15
+ Requires-Dist: tree-sitter-lua>=0.2.0
16
+ Requires-Dist: torch>=2.12.1
17
+ Requires-Dist: sentence-transformers>=5.6.0
18
+ Dynamic: license-file
19
+
20
+ <div align="center">
21
+ <img src="docs/assets/factorio-ai-tools.png" alt="Factorio AI Tools Icon" width="200"/>
22
+ <br/>
23
+ <a href="https://pypi.org/project/factorio-ai-tools/"><img src="https://img.shields.io/pypi/v/factorio-ai-tools" alt="PyPI - Version"/></a>
24
+ <a href="https://github.com/solarcloud7/factorio-ai-tools/releases"><img src="https://img.shields.io/github/v/release/solarcloud7/factorio-ai-tools" alt="GitHub Release"/></a>
25
+ </div>
26
+
27
+ # Factorio AI Tools (MCP Server)
28
+
29
+ A lightning-fast, hybrid-search Vector Database and Model Context Protocol (MCP) server designed to give LLMs absolute expertise over Factorio modding and Clusterio plugin development.
30
+
31
+ ## Architecture
32
+
33
+ This project consists of 5 main components:
34
+ 1. **Factorio Docs Ingestion (`ingest_factorio.py`)**: Scrapes the official Lua API documentation and Data Phase Prototypes across multiple versions (e.g. `1.1.110` and `latest`).
35
+ 2. **Clusterio Codebase Ingestion (`ingest_clusterio.py`)**: Uses AST (Abstract Syntax Tree) parsing to semantically chunk the massive Node.js/TypeScript Clusterio plugin architecture.
36
+ 3. **Factorio Wiki Ingestion (`ingest_wiki.py`)**: Scrapes the official Factorio Wiki via the MediaWiki API, exclusively extracting English wikitext for gameplay mechanics, ratios, and formulas.
37
+ 4. **GitHub Mod Ingestion (`ingest_github_repo.py`)**: A generalized pipeline that clones, AST-parses (via `tree-sitter-lua`), and incrementally hashes any GitHub Mod codebase (e.g., Maraxsis) into a semantic `mod_lancedb` index.
38
+ 5. **FastMCP Server (`server.py`)**: The bridge that connects the underlying LanceDB vector databases to an LLM via the standard Model Context Protocol.
39
+
40
+ ## Setup & Usage
41
+
42
+ There are three primary ways to install and use this MCP server locally with Claude Desktop (or any other MCP client):
43
+
44
+ ### Method 1: Using `uvx` (Recommended)
45
+ If you have `uv` installed, this is the cleanest way to run the server. It will automatically download the package from PyPI and fetch the necessary vector databases on the first run.
46
+ Add the following to your Claude Desktop config (`%APPDATA%\Claude\claude_desktop_config.json` or `~/Library/Application Support/Claude/claude_desktop_config.json`):
47
+ ```json
48
+ {
49
+ "mcpServers": {
50
+ "factorio-ai-tools": {
51
+ "command": "uvx",
52
+ "args": ["factorio-ai-tools"]
53
+ }
54
+ }
55
+ }
56
+ ```
57
+
58
+ ### Method 2: Docker (Pre-packaged Datasets)
59
+ If you have Docker Desktop installed, you can simply pull the pre-packaged container natively. The Docker container includes the databases inside the image, so no additional downloads are required at runtime.
60
+ ```json
61
+ {
62
+ "mcpServers": {
63
+ "factorio-ai-tools": {
64
+ "command": "docker",
65
+ "args": ["run", "-i", "--rm", "ghcr.io/solarcloud7/factorio-ai-tools:latest"]
66
+ }
67
+ }
68
+ }
69
+
70
+ ```
71
+
72
+ ### Method 3: Global SSE Server (Save RAM/VRAM)
73
+ By default, standard `stdio` MCP execution spawns a completely separate Python process for *every single client connection*. Because this server uses PyTorch and `sentence-transformers`, every connection will load the embedding model again, consuming roughly 500MB of RAM/VRAM per instance.
74
+
75
+ If you want to use the MCP server across multiple IDEs or workspaces simultaneously without duplicating memory, you can run a single global HTTP SSE server in the background:
76
+
77
+ ```powershell
78
+ uv run factorio-ai-tools --sse --port 8000
79
+ ```
80
+
81
+ Then, configure your IDE or Claude client to connect to the SSE endpoint (e.g., `http://localhost:8000/sse`) instead of executing the CLI via `stdio`.
82
+
83
+ ### Selective Tool Loading (Optional)
84
+ By default, the server loads all available tools. If you only want to expose specific tools to your LLM, you can use the `--enable-tools` or `--disable-tools` arguments.
85
+ For example, to *only* load the doc search and the blueprint decoder using `uvx`:
86
+ ```json
87
+ "command": "uvx",
88
+ "args": [
89
+ "factorio-ai-tools",
90
+ "--enable-tools", "search_factorio_docs,decode_factorio_blueprint"
91
+ ]
92
+ ```
93
+ ---
94
+
95
+ ### Manual Developer Setup
96
+ If you wish to run the python scripts manually or ingest custom codebases:
97
+ 1. Create a python virtual environment: `python -m venv venv` and activate it.
98
+ 2. Run `pip install -r requirements.txt`.
99
+ 3. *(Optional)* Run the ingestion scripts (`python -m factorio_ai_tools.ingest.ingest_factorio`, etc.) to rebuild the LanceDB tables.
100
+ 4. *(Optional)* Ingest a specific GitHub Mod:
101
+ ```powershell
102
+ python -m factorio_ai_tools.ingest.ingest_github_repo --repo-url https://github.com/notnotmelon/maraxsis
103
+ ```
104
+
105
+ ## Maintenance (Database Hygiene)
106
+
107
+ LanceDB is append-only: every ingest run adds new immutable versions and small data fragments, and **nothing is garbage-collected automatically**. Re-running an ingest script grows the on-disk history (e.g. `factorio_lancedb` had 155 versions / 469 files before its first compaction). To keep the committed stores lean:
108
+
109
+ ```powershell
110
+ python maintenance/compact_lancedb.py # compact + prune every data/*_lancedb store
111
+ python maintenance/compact_lancedb.py --check # read-only; exits non-zero if a store is uncompacted
112
+ ```
113
+
114
+ This runs LanceDB's `Table.optimize()` on each store — compacting fragments, pruning old versions, and folding new rows into existing indices. Do **not** run it while the server or an ingest script is writing.
115
+
116
+ **Recommended workflow:** let the version history accumulate on feature branches so a PR diff shows exactly what data changed, then run the compaction script before merging to `main` so the committed history stays collapsed.
117
+
118
+ To enforce that automatically, opt into the bundled pre-push guard (it blocks pushes to `main` while any store is uncompacted):
119
+
120
+ ```powershell
121
+ git config core.hooksPath maintenance/hooks
122
+ # or copy maintenance/hooks/pre-push into .git/hooks/
123
+ ```
124
+
125
+
126
+
127
+ ## Tools Included
128
+
129
+ - `search_factorio_docs`: Look up Lua Runtime API methods, concepts, events, and Data Phase prototypes. Supports version filtering (`1.1.110` vs `latest`).
130
+ - `search_clusterio_code`: Semantically search the Clusterio Node.js architecture.
131
+ - `search_factorio_wiki`: Access game mechanics, ratios, and fluid mechanics straight from the Wiki.
132
+ - `search_mod_code`: Semantically search through specific downloaded GitHub mods (e.g., `maraxsis`) to read their Lua codebase.
133
+ - `decode_factorio_blueprint`: Convert Factorio blueprint strings (e.g. `0eNq...`) into easily readable/editable JSON.
134
+ - `encode_factorio_blueprint`: Compress generated JSON back into an importable Factorio blueprint string.
135
+ - `factorio_mod_portal_analyzer`: Scrape and summarize the Factorio Mod Portal for any given mod to retrieve dependencies and release versions.
136
+
137
+ - `get_mcp_version_info`: Self-diagnostics tool to verify the currently loaded database versions.
@@ -0,0 +1,118 @@
1
+ <div align="center">
2
+ <img src="docs/assets/factorio-ai-tools.png" alt="Factorio AI Tools Icon" width="200"/>
3
+ <br/>
4
+ <a href="https://pypi.org/project/factorio-ai-tools/"><img src="https://img.shields.io/pypi/v/factorio-ai-tools" alt="PyPI - Version"/></a>
5
+ <a href="https://github.com/solarcloud7/factorio-ai-tools/releases"><img src="https://img.shields.io/github/v/release/solarcloud7/factorio-ai-tools" alt="GitHub Release"/></a>
6
+ </div>
7
+
8
+ # Factorio AI Tools (MCP Server)
9
+
10
+ A lightning-fast, hybrid-search Vector Database and Model Context Protocol (MCP) server designed to give LLMs absolute expertise over Factorio modding and Clusterio plugin development.
11
+
12
+ ## Architecture
13
+
14
+ This project consists of 5 main components:
15
+ 1. **Factorio Docs Ingestion (`ingest_factorio.py`)**: Scrapes the official Lua API documentation and Data Phase Prototypes across multiple versions (e.g. `1.1.110` and `latest`).
16
+ 2. **Clusterio Codebase Ingestion (`ingest_clusterio.py`)**: Uses AST (Abstract Syntax Tree) parsing to semantically chunk the massive Node.js/TypeScript Clusterio plugin architecture.
17
+ 3. **Factorio Wiki Ingestion (`ingest_wiki.py`)**: Scrapes the official Factorio Wiki via the MediaWiki API, exclusively extracting English wikitext for gameplay mechanics, ratios, and formulas.
18
+ 4. **GitHub Mod Ingestion (`ingest_github_repo.py`)**: A generalized pipeline that clones, AST-parses (via `tree-sitter-lua`), and incrementally hashes any GitHub Mod codebase (e.g., Maraxsis) into a semantic `mod_lancedb` index.
19
+ 5. **FastMCP Server (`server.py`)**: The bridge that connects the underlying LanceDB vector databases to an LLM via the standard Model Context Protocol.
20
+
21
+ ## Setup & Usage
22
+
23
+ There are three primary ways to install and use this MCP server locally with Claude Desktop (or any other MCP client):
24
+
25
+ ### Method 1: Using `uvx` (Recommended)
26
+ If you have `uv` installed, this is the cleanest way to run the server. It will automatically download the package from PyPI and fetch the necessary vector databases on the first run.
27
+ Add the following to your Claude Desktop config (`%APPDATA%\Claude\claude_desktop_config.json` or `~/Library/Application Support/Claude/claude_desktop_config.json`):
28
+ ```json
29
+ {
30
+ "mcpServers": {
31
+ "factorio-ai-tools": {
32
+ "command": "uvx",
33
+ "args": ["factorio-ai-tools"]
34
+ }
35
+ }
36
+ }
37
+ ```
38
+
39
+ ### Method 2: Docker (Pre-packaged Datasets)
40
+ If you have Docker Desktop installed, you can simply pull the pre-packaged container natively. The Docker container includes the databases inside the image, so no additional downloads are required at runtime.
41
+ ```json
42
+ {
43
+ "mcpServers": {
44
+ "factorio-ai-tools": {
45
+ "command": "docker",
46
+ "args": ["run", "-i", "--rm", "ghcr.io/solarcloud7/factorio-ai-tools:latest"]
47
+ }
48
+ }
49
+ }
50
+
51
+ ```
52
+
53
+ ### Method 3: Global SSE Server (Save RAM/VRAM)
54
+ By default, standard `stdio` MCP execution spawns a completely separate Python process for *every single client connection*. Because this server uses PyTorch and `sentence-transformers`, every connection will load the embedding model again, consuming roughly 500MB of RAM/VRAM per instance.
55
+
56
+ If you want to use the MCP server across multiple IDEs or workspaces simultaneously without duplicating memory, you can run a single global HTTP SSE server in the background:
57
+
58
+ ```powershell
59
+ uv run factorio-ai-tools --sse --port 8000
60
+ ```
61
+
62
+ Then, configure your IDE or Claude client to connect to the SSE endpoint (e.g., `http://localhost:8000/sse`) instead of executing the CLI via `stdio`.
63
+
64
+ ### Selective Tool Loading (Optional)
65
+ By default, the server loads all available tools. If you only want to expose specific tools to your LLM, you can use the `--enable-tools` or `--disable-tools` arguments.
66
+ For example, to *only* load the doc search and the blueprint decoder using `uvx`:
67
+ ```json
68
+ "command": "uvx",
69
+ "args": [
70
+ "factorio-ai-tools",
71
+ "--enable-tools", "search_factorio_docs,decode_factorio_blueprint"
72
+ ]
73
+ ```
74
+ ---
75
+
76
+ ### Manual Developer Setup
77
+ If you wish to run the python scripts manually or ingest custom codebases:
78
+ 1. Create a python virtual environment: `python -m venv venv` and activate it.
79
+ 2. Run `pip install -r requirements.txt`.
80
+ 3. *(Optional)* Run the ingestion scripts (`python -m factorio_ai_tools.ingest.ingest_factorio`, etc.) to rebuild the LanceDB tables.
81
+ 4. *(Optional)* Ingest a specific GitHub Mod:
82
+ ```powershell
83
+ python -m factorio_ai_tools.ingest.ingest_github_repo --repo-url https://github.com/notnotmelon/maraxsis
84
+ ```
85
+
86
+ ## Maintenance (Database Hygiene)
87
+
88
+ LanceDB is append-only: every ingest run adds new immutable versions and small data fragments, and **nothing is garbage-collected automatically**. Re-running an ingest script grows the on-disk history (e.g. `factorio_lancedb` had 155 versions / 469 files before its first compaction). To keep the committed stores lean:
89
+
90
+ ```powershell
91
+ python maintenance/compact_lancedb.py # compact + prune every data/*_lancedb store
92
+ python maintenance/compact_lancedb.py --check # read-only; exits non-zero if a store is uncompacted
93
+ ```
94
+
95
+ This runs LanceDB's `Table.optimize()` on each store — compacting fragments, pruning old versions, and folding new rows into existing indices. Do **not** run it while the server or an ingest script is writing.
96
+
97
+ **Recommended workflow:** let the version history accumulate on feature branches so a PR diff shows exactly what data changed, then run the compaction script before merging to `main` so the committed history stays collapsed.
98
+
99
+ To enforce that automatically, opt into the bundled pre-push guard (it blocks pushes to `main` while any store is uncompacted):
100
+
101
+ ```powershell
102
+ git config core.hooksPath maintenance/hooks
103
+ # or copy maintenance/hooks/pre-push into .git/hooks/
104
+ ```
105
+
106
+
107
+
108
+ ## Tools Included
109
+
110
+ - `search_factorio_docs`: Look up Lua Runtime API methods, concepts, events, and Data Phase prototypes. Supports version filtering (`1.1.110` vs `latest`).
111
+ - `search_clusterio_code`: Semantically search the Clusterio Node.js architecture.
112
+ - `search_factorio_wiki`: Access game mechanics, ratios, and fluid mechanics straight from the Wiki.
113
+ - `search_mod_code`: Semantically search through specific downloaded GitHub mods (e.g., `maraxsis`) to read their Lua codebase.
114
+ - `decode_factorio_blueprint`: Convert Factorio blueprint strings (e.g. `0eNq...`) into easily readable/editable JSON.
115
+ - `encode_factorio_blueprint`: Compress generated JSON back into an importable Factorio blueprint string.
116
+ - `factorio_mod_portal_analyzer`: Scrape and summarize the Factorio Mod Portal for any given mod to retrieve dependencies and release versions.
117
+
118
+ - `get_mcp_version_info`: Self-diagnostics tool to verify the currently loaded database versions.