malimgraph 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
malimgraph/__init__.py ADDED
@@ -0,0 +1,29 @@
1
+ """MalimGraph — Transform PDF documents into structured knowledge graphs."""
2
+
3
+ __version__ = "0.1.1"
4
+ __author__ = "Malim AI Labs"
5
+ __email__ = "hello@malim.my"
6
+
7
+ from malimgraph.schemas.chunks import Chunk, ChunkCollection
8
+ from malimgraph.schemas.entities import (
9
+ Citation,
10
+ Confidence,
11
+ Entity,
12
+ ExtractionMethod,
13
+ GraphMetadata,
14
+ KnowledgeGraph,
15
+ Relationship,
16
+ )
17
+
18
+ __all__ = [
19
+ "Citation",
20
+ "Chunk",
21
+ "ChunkCollection",
22
+ "Confidence",
23
+ "Entity",
24
+ "ExtractionMethod",
25
+ "GraphMetadata",
26
+ "KnowledgeGraph",
27
+ "Relationship",
28
+ "__version__",
29
+ ]
malimgraph/cli.py ADDED
@@ -0,0 +1,516 @@
1
+ """MalimGraph CLI — click-based command interface."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ import sys
8
+
9
+ import click
10
+
11
+
12
# Root command group: every sub-command and sub-group below is registered on it.
# The docstring doubles as the top-level --help text.
@click.group()
@click.version_option(package_name="malimgraph")
def cli():
    """MalimGraph — Transform PDF documents into structured knowledge graphs."""
17
+
18
+
19
@cli.command("extract")
@click.option(
    "--input",
    "-i",
    "input_path",
    required=True,
    type=click.Path(exists=True),
    help="Path to the PDF file.",
)
@click.option(
    "--output", "-o", "output_dir", default="./output", show_default=True, help="Output directory."
)
@click.option(
    "--entity-types",
    default="auto",
    show_default=True,
    help="Comma-separated entity types or 'auto'.",
)
@click.option(
    "--format",
    "output_format",
    default="all",
    show_default=True,
    type=click.Choice(["json", "cypher", "age_sql", "all"]),
    help="Output format(s).",
)
@click.option(
    "--graph-name", default="document_graph", show_default=True, help="Graph name for AGE SQL."
)
def extract_cmd(input_path, output_dir, entity_types, output_format, graph_name):
    """Extract a knowledge graph from a PDF document."""
    # Pipeline: read PDF -> rule-based entities -> LLM entities/relationships
    # -> merged graph -> output files in `output_dir`.
    #
    # Heavy modules are imported lazily so that `--help` and unrelated
    # sub-commands do not pay for (or fail on) their dependencies.
    from malimgraph.core.graph_builder import build_knowledge_graph
    from malimgraph.core.llm_extractor import extract_by_llm
    from malimgraph.core.pdf_reader import extract_text_from_pdf
    from malimgraph.core.rule_extractor import extract_by_rules
    from malimgraph.generators.age_sql import generate_age_sql
    from malimgraph.generators.cypher import generate_cypher

    os.makedirs(output_dir, exist_ok=True)

    # "auto" (any casing) or an empty value means "no explicit type filter",
    # which is passed on to the LLM extractor as None.
    etype_list = None
    if entity_types and entity_types.lower() != "auto":
        etype_list = [e.strip() for e in entity_types.split(",") if e.strip()]

    click.echo(f"[extract] Reading: {input_path}")
    doc = extract_text_from_pdf(input_path)
    click.echo(f" → {doc.total_pages} pages")

    click.echo("[extract] Rule-based extraction...")
    rule_entities = extract_by_rules(doc)
    click.echo(f" → {len(rule_entities)} entities")

    click.echo("[extract] LLM extraction (requires ANTHROPIC_API_KEY)...")
    try:
        llm_entities, llm_relationships = extract_by_llm(doc, entity_types=etype_list)
        click.echo(f" → {len(llm_entities)} entities, {len(llm_relationships)} relationships")
    except ValueError as e:
        # Best effort: a ValueError from the LLM step downgrades the run to
        # rule-based results only instead of aborting the command.
        click.echo(f" [Warning] Skipped: {e}", err=True)
        llm_entities, llm_relationships = [], []

    kg = build_knowledge_graph(doc, rule_entities, llm_entities, llm_relationships, graph_name)
    click.echo(
        f"[extract] Graph: {kg.metadata.total_entities} entities, {kg.metadata.total_relationships} relationships"
    )

    # The JSON graph is written for every --format choice (not only
    # "json"/"all").
    # mode="json" makes pydantic convert every field to a JSON-compatible
    # value first, so json.dump cannot fail on non-native field types.
    kg_path = os.path.join(output_dir, "knowledge_graph.json")
    with open(kg_path, "w", encoding="utf-8") as f:
        json.dump(kg.model_dump(mode="json"), f, indent=2, ensure_ascii=False)
    click.echo(f" ✓ {kg_path}")

    if output_format in ("cypher", "all"):
        cypher_path = os.path.join(output_dir, "knowledge_graph.cypher")
        with open(cypher_path, "w", encoding="utf-8") as f:
            f.write(generate_cypher(kg))
        click.echo(f" ✓ {cypher_path}")

    if output_format in ("age_sql", "all"):
        sql_path = os.path.join(output_dir, "knowledge_graph.sql")
        with open(sql_path, "w", encoding="utf-8") as f:
            f.write(generate_age_sql(kg, graph_name=graph_name))
        click.echo(f" ✓ {sql_path}")

    click.echo("[extract] Done.")
103
+
104
+
105
@cli.command("chunk")
@click.option("--input", "-i", "input_path", required=True, type=click.Path(exists=True))
@click.option("--output", "-o", "output_dir", default="./chunks", show_default=True)
@click.option("--chunk-size", default=512, show_default=True, type=int)
@click.option("--overlap", default=64, show_default=True, type=int)
@click.option(
    "--format",
    "output_format",
    default="json",
    show_default=True,
    type=click.Choice(["json", "txt", "md"]),
)
def chunk_cmd(input_path, output_dir, chunk_size, overlap, output_format):
    """Split a PDF into embedding-ready text chunks."""
    # Output layout depends on --format:
    #   json -> one chunks.json holding the whole collection
    #   txt  -> one <chunk_id>.txt per chunk, with a front-matter header
    #   md   -> one chunks.md with a section per chunk
    from malimgraph.core.chunker import chunk_document
    from malimgraph.core.pdf_reader import extract_text_from_pdf

    os.makedirs(output_dir, exist_ok=True)

    click.echo(f"[chunk] Reading: {input_path}")
    doc = extract_text_from_pdf(input_path)
    collection = chunk_document(doc, chunk_size=chunk_size, chunk_overlap=overlap)
    click.echo(
        f" → {collection.metadata.total_chunks} chunks, {collection.metadata.total_tokens} tokens"
    )

    if output_format == "json":
        out_path = os.path.join(output_dir, "chunks.json")
        with open(out_path, "w", encoding="utf-8") as f:
            # mode="json" makes pydantic convert every field to a
            # JSON-compatible value before json.dump sees it.
            json.dump(collection.model_dump(mode="json"), f, indent=2, ensure_ascii=False)
        click.echo(f" ✓ {out_path}")

    elif output_format == "txt":
        for chunk in collection.chunks:
            fname = os.path.join(output_dir, f"{chunk.chunk_id}.txt")
            frontmatter = (
                f"---\nchunk_id: {chunk.chunk_id}\npages: {chunk.source_pages}\n"
                f"tokens: {chunk.token_count}\nheading_context: {chunk.heading_context}\n---\n\n"
            )
            with open(fname, "w", encoding="utf-8") as f:
                f.write(frontmatter + chunk.text)
        click.echo(f" ✓ {collection.metadata.total_chunks} .txt files in {output_dir}/")

    elif output_format == "md":
        lines = [f"# Chunks — {collection.metadata.source_file}\n"]
        for chunk in collection.chunks:
            # position.index is shown +1, i.e. treated as zero-based here.
            lines.append(f"## Chunk {chunk.position.index + 1} of {chunk.position.total}")
            lines.append(f"**Pages:** {chunk.source_pages} ")
            lines.append(f"**Tokens:** {chunk.token_count} ")
            if chunk.heading_context:
                lines.append(f"**Context:** {' > '.join(chunk.heading_context)} ")
            lines.append("")
            lines.append(chunk.text)
            lines.append("\n---\n")
        out_path = os.path.join(output_dir, "chunks.md")
        with open(out_path, "w", encoding="utf-8") as f:
            f.write("\n".join(lines))
        click.echo(f" ✓ {out_path}")

    click.echo("[chunk] Done.")
165
+
166
+
167
@cli.command("render")
@click.option("--input", "-i", "input_path", required=True, type=click.Path(exists=True))
@click.option("--output", "-o", "output_path", default="document.html", show_default=True)
@click.option(
    "--knowledge-graph",
    "kg_path",
    default=None,
    type=click.Path(),
    help="knowledge_graph.json for entity annotations.",
)
@click.option("--toc/--no-toc", default=True, show_default=True)
@click.option("--search/--no-search", default=True, show_default=True)
def render_cmd(input_path, output_path, kg_path, toc, search):
    """Render a PDF as structured, LLM-readable HTML."""
    # Reads the PDF, optionally loads a previously extracted knowledge graph
    # for entity annotations, and writes a single HTML file to `output_path`.
    from malimgraph.core.html_renderer import render_document_html
    from malimgraph.core.pdf_reader import extract_text_from_pdf
    from malimgraph.schemas.entities import KnowledgeGraph

    click.echo(f"[render] Reading: {input_path}")
    doc = extract_text_from_pdf(input_path)

    kg = None
    if kg_path:
        if os.path.exists(kg_path):
            with open(kg_path, "r", encoding="utf-8") as f:
                kg = KnowledgeGraph.model_validate(json.load(f))
            click.echo(f" → Annotating with {len(kg.entities)} entities from {kg_path}")
        else:
            # Annotations stay optional, but a path the user asked for and
            # that is missing must not be dropped silently.
            click.echo(
                f" [Warning] Knowledge graph not found, rendering without annotations: {kg_path}",
                err=True,
            )

    html_content = render_document_html(
        doc, knowledge_graph=kg, include_toc=toc, include_search=search
    )

    # abspath() first so a bare filename resolves to the current directory
    # rather than an empty dirname, which makedirs would reject.
    os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_content)
    click.echo(f" ✓ {output_path}")
    click.echo("[render] Done.")
203
+
204
+
205
# Container for the `db load`, `db query` and `db stats` sub-commands.
@cli.group("db")
def db_group():
    """Graph database management (Neo4j / Apache AGE)."""
209
+
210
+
211
@db_group.command("load")
@click.option(
    "--input",
    "-i",
    "input_path",
    required=True,
    type=click.Path(exists=True),
    help="knowledge_graph.json to load.",
)
@click.option("--target", default="neo4j", show_default=True, type=click.Choice(["neo4j", "age"]))
@click.option("--uri", default=None, help="Connection URI.")
@click.option("--user", default=None, help="Neo4j user.")
@click.option("--password", default=None, help="Neo4j password.")
@click.option("--graph-name", default="document_graph", show_default=True)
def db_load(input_path, target, uri, user, password, graph_name):
    """Load a knowledge graph into Neo4j or Apache AGE."""
    from malimgraph.core.db_client import get_client
    from malimgraph.schemas.entities import KnowledgeGraph

    # Parse and validate the graph before opening any database connection.
    with open(input_path, "r", encoding="utf-8") as handle:
        graph = KnowledgeGraph.model_validate(json.load(handle))

    # Forward only the connection settings that were actually supplied;
    # omitted ones are left out of the call entirely
    # (presumably so get_client applies its own defaults - confirm there).
    supplied = {"uri": uri, "user": user, "password": password}
    client_kwargs = {"graph_name": graph_name}
    client_kwargs.update({key: value for key, value in supplied.items() if value})

    client = get_client(target, **client_kwargs)
    try:
        click.echo(f" ✓ Loaded: {client.load_graph(graph)}")
    finally:
        # Always release the connection, even when the load fails.
        client.close()
247
+
248
+
249
@db_group.command("query")
@click.option("--target", default="neo4j", show_default=True, type=click.Choice(["neo4j", "age"]))
@click.option("--uri", default=None)
@click.option("--user", default=None)
@click.option("--password", default=None)
@click.option("--graph-name", default="document_graph", show_default=True)
@click.option("--query", "-q", "cypher_query", required=True)
def db_query(target, uri, user, password, graph_name, cypher_query):
    """Run a Cypher query against a graph database."""
    from malimgraph.core.db_client import get_client

    # Forward only the connection settings that were actually supplied.
    kwargs = {"graph_name": graph_name}
    if uri:
        kwargs["uri"] = uri
    if user:
        kwargs["user"] = user
    if password:
        kwargs["password"] = password

    client = get_client(target, **kwargs)
    try:
        rows = client.query(cypher_query)
        # default=str mirrors the vector search/list commands: a row value
        # that json cannot encode natively is printed as its string form
        # instead of raising TypeError after the query already succeeded.
        click.echo(json.dumps(rows, indent=2, default=str))
    finally:
        # Always release the connection, even when the query fails.
        client.close()
274
+
275
+
276
@db_group.command("stats")
@click.option("--target", default="neo4j", show_default=True, type=click.Choice(["neo4j", "age"]))
@click.option("--uri", default=None)
@click.option("--user", default=None)
@click.option("--password", default=None)
@click.option("--graph-name", default="document_graph", show_default=True)
def db_stats(target, uri, user, password, graph_name):
    """Show graph database statistics."""
    from malimgraph.core.db_client import get_client

    # Start from the always-present graph name, then add whichever optional
    # connection settings were given on the command line.
    client_kwargs = {"graph_name": graph_name}
    for option_name, option_value in (("uri", uri), ("user", user), ("password", password)):
        if option_value:
            client_kwargs[option_name] = option_value

    client = get_client(target, **client_kwargs)
    try:
        summary = client.stats()
        click.echo(json.dumps(summary, indent=2))
    finally:
        # Always release the connection, even when the stats call fails.
        client.close()
300
+
301
+
302
# Container for the `vector load/search/stats/list/delete` sub-commands.
@cli.group("vector")
def vector_group():
    """PostgreSQL pgvector — embed and search document chunks."""
306
+
307
+
308
@vector_group.command("load")
@click.option(
    "--input",
    "-i",
    "input_path",
    required=True,
    type=click.Path(exists=True),
    help="chunks.json from malimgraph chunk.",
)
@click.option("--uri", default=None, envvar="PGVECTOR_URI", help="PostgreSQL connection URI.")
@click.option("--table", default="document_chunks", show_default=True, help="Target table name.")
@click.option(
    "--provider",
    default="openai",
    show_default=True,
    type=click.Choice(["openai", "voyage", "local"]),
    help="Embedding provider.",
)
@click.option(
    "--model", default=None, help="Embedding model override (uses provider default if omitted)."
)
@click.option("--document-id", default=None, help="Document namespace (default: source filename).")
@click.option(
    "--skip-existing/--no-skip-existing",
    default=True,
    show_default=True,
    help="Skip chunks already in the table.",
)
def vector_load(input_path, uri, table, provider, model, document_id, skip_existing):
    """Embed chunks and store them in PostgreSQL with pgvector."""
    from malimgraph.core.embedder import EmbedderConfig
    from malimgraph.core.vector_client import PgVectorClient
    from malimgraph.schemas.chunks import ChunkCollection

    # The URI may come from --uri or the PGVECTOR_URI environment variable;
    # without either there is nothing to connect to.
    if not uri:
        click.echo("ERROR: --uri or PGVECTOR_URI env var required.", err=True)
        sys.exit(1)

    with open(input_path, "r", encoding="utf-8") as handle:
        chunk_collection = ChunkCollection.model_validate(json.load(handle))

    embedder = EmbedderConfig(provider=provider, model=model)
    click.echo(
        f"[vector] Provider: {embedder.provider} / Model: {embedder.model} (dim={embedder.dimension})"
    )
    click.echo(f"[vector] Chunks to process: {chunk_collection.metadata.total_chunks}")

    store = PgVectorClient(uri, table_name=table, embedder_config=embedder)
    try:
        outcome = store.load_chunks(
            chunk_collection, document_id=document_id, skip_existing=skip_existing
        )
    finally:
        # Close the connection before reporting, whether or not the load failed.
        store.close()

    click.echo(f" ✓ Inserted: {outcome['inserted']}")
    click.echo(f" ✓ Updated: {outcome['updated']}")
    click.echo(f" - Skipped: {outcome['skipped']}")
    click.echo("[vector] Done.")
367
+
368
+
369
@vector_group.command("search")
@click.option("--query", "-q", required=True, help="Search query text.")
@click.option("--uri", default=None, envvar="PGVECTOR_URI")
@click.option("--table", default="document_chunks", show_default=True)
@click.option(
    "--provider",
    default="openai",
    show_default=True,
    type=click.Choice(["openai", "voyage", "local"]),
)
@click.option("--model", default=None)
@click.option(
    "--top-k", default=10, show_default=True, type=int, help="Number of results to return."
)
@click.option("--document-id", default=None, help="Limit search to a specific document.")
@click.option(
    "--min-score",
    default=0.0,
    show_default=True,
    type=float,
    help="Minimum cosine similarity score.",
)
def vector_search(query, uri, table, provider, model, top_k, document_id, min_score):
    """Semantic search across embedded chunks."""
    from malimgraph.core.embedder import EmbedderConfig
    from malimgraph.core.vector_client import PgVectorClient

    # The URI may come from --uri or the PGVECTOR_URI environment variable.
    if not uri:
        click.echo("ERROR: --uri or PGVECTOR_URI env var required.", err=True)
        sys.exit(1)

    embedder = EmbedderConfig(provider=provider, model=model)
    store = PgVectorClient(uri, table_name=table, embedder_config=embedder)
    try:
        matches = store.similarity_search(
            query, top_k=top_k, document_id=document_id, min_score=min_score
        )
    finally:
        store.close()

    # Results go to stdout as JSON; the human-readable count goes to stderr
    # so that piping stdout into another tool still yields valid JSON.
    click.echo(json.dumps(matches, indent=2, default=str))
    click.echo(f"\n{len(matches)} result(s) returned.", err=True)
411
+
412
+
413
@vector_group.command("stats")
@click.option("--uri", default=None, envvar="PGVECTOR_URI")
@click.option("--table", default="document_chunks", show_default=True)
@click.option(
    "--provider",
    default="openai",
    show_default=True,
    type=click.Choice(["openai", "voyage", "local"]),
)
def vector_stats(uri, table, provider):
    """Show pgvector table statistics."""
    from malimgraph.core.embedder import EmbedderConfig
    from malimgraph.core.vector_client import PgVectorClient

    # The URI may come from --uri or the PGVECTOR_URI environment variable.
    if not uri:
        click.echo("ERROR: --uri or PGVECTOR_URI env var required.", err=True)
        sys.exit(1)

    store = PgVectorClient(
        uri, table_name=table, embedder_config=EmbedderConfig(provider=provider)
    )
    try:
        table_stats = store.stats()
    finally:
        store.close()

    click.echo(json.dumps(table_stats, indent=2))
439
+
440
+
441
@vector_group.command("list")
@click.option("--uri", default=None, envvar="PGVECTOR_URI")
@click.option("--table", default="document_chunks", show_default=True)
@click.option(
    "--provider",
    default="openai",
    show_default=True,
    type=click.Choice(["openai", "voyage", "local"]),
)
def vector_list(uri, table, provider):
    """List all indexed documents."""
    from malimgraph.core.embedder import EmbedderConfig
    from malimgraph.core.vector_client import PgVectorClient

    # The URI may come from --uri or the PGVECTOR_URI environment variable.
    if not uri:
        click.echo("ERROR: --uri or PGVECTOR_URI env var required.", err=True)
        sys.exit(1)

    store = PgVectorClient(
        uri, table_name=table, embedder_config=EmbedderConfig(provider=provider)
    )
    try:
        documents = store.list_documents()
    finally:
        store.close()

    # default=str: values json cannot encode natively are printed as strings.
    click.echo(json.dumps(documents, indent=2, default=str))
467
+
468
+
469
@vector_group.command("delete")
@click.option("--document-id", required=True, help="Document ID to remove from the table.")
@click.option("--uri", default=None, envvar="PGVECTOR_URI")
@click.option("--table", default="document_chunks", show_default=True)
@click.option(
    "--provider",
    default="openai",
    show_default=True,
    type=click.Choice(["openai", "voyage", "local"]),
)
def vector_delete(document_id, uri, table, provider):
    """Delete all chunks for a document from the vector table."""
    from malimgraph.core.embedder import EmbedderConfig
    from malimgraph.core.vector_client import PgVectorClient

    # The URI may come from --uri or the PGVECTOR_URI environment variable.
    if not uri:
        click.echo("ERROR: --uri or PGVECTOR_URI env var required.", err=True)
        sys.exit(1)

    store = PgVectorClient(
        uri, table_name=table, embedder_config=EmbedderConfig(provider=provider)
    )
    try:
        removed_count = store.delete_document(document_id)
    finally:
        store.close()

    click.echo(f" ✓ Deleted {removed_count} chunks for document '{document_id}'.")
496
+
497
+
498
@cli.command("serve")
@click.option(
    "--transport", default="stdio", show_default=True, type=click.Choice(["stdio", "http"])
)
@click.option("--port", default=8080, show_default=True, type=int)
def serve_cmd(transport, port):
    """Start the MalimGraph MCP server."""
    from malimgraph.server import run_server

    # The port is only mentioned in the banner for the http transport;
    # it is still passed to run_server in both cases.
    port_note = f", port={port}" if transport == "http" else ""
    click.echo(f"[serve] Starting MCP server (transport={transport}{port_note})")
    run_server(transport=transport, port=port)
513
+
514
+
515
+ if __name__ == "__main__":
516
+ cli()
@@ -0,0 +1,15 @@
1
+ from malimgraph.core.chunker import chunk_document
2
+ from malimgraph.core.graph_builder import build_knowledge_graph
3
+ from malimgraph.core.html_renderer import render_document_html
4
+ from malimgraph.core.llm_extractor import extract_by_llm
5
+ from malimgraph.core.pdf_reader import extract_text_from_pdf
6
+ from malimgraph.core.rule_extractor import extract_by_rules
7
+
8
+ __all__ = [
9
+ "build_knowledge_graph",
10
+ "chunk_document",
11
+ "extract_by_llm",
12
+ "extract_by_rules",
13
+ "extract_text_from_pdf",
14
+ "render_document_html",
15
+ ]