wikigen 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wikigen/nodes/nodes.py ADDED
@@ -0,0 +1,1080 @@
1
+ import os
2
+ import time
3
+ import yaml
4
+ from pocketflow import Node, BatchNode
5
+ from wikigen.utils.crawl_github_files import crawl_github_files
6
+ from wikigen.utils.call_llm import call_llm
7
+ from wikigen.utils.crawl_local_files import crawl_local_files
8
+ from wikigen.formatter.output_formatter import (
9
+ Icons,
10
+ print_phase_start,
11
+ print_operation,
12
+ print_success,
13
+ print_phase_end,
14
+ format_size,
15
+ )
16
+
17
+
18
+ # Helper to get content for specific file indices
19
def get_content_for_indices(files_data, indices):
    """Collect the content of selected files, keyed by index and path.

    Args:
        files_data: Sequence of ``(path, content)`` pairs.
        indices: Iterable of integer positions into ``files_data``.

    Returns:
        A dict mapping ``"<index> # <path>"`` to the file content, in the
        order the indices were given. Indices outside
        ``0 <= i < len(files_data)`` (including negative ones) are skipped
        silently.
    """
    file_count = len(files_data)  # hoisted: invariant across the loop
    content_map = {}
    for i in indices:
        if 0 <= i < file_count:
            path, content = files_data[i]
            # Index + path as key gives the LLM both identifiers for context.
            content_map[f"{i} # {path}"] = content
    return content_map
28
+
29
+
30
class FetchRepo(Node):
    """Crawl a GitHub repository or a local directory into ``shared["files"]``."""

    @staticmethod
    def _derive_project_name(repo_url, local_dir):
        """Return a project name taken from the repo URL or the directory name.

        Raises:
            ValueError: If neither ``repo_url`` nor ``local_dir`` is set.
        """
        if repo_url:
            # Drop trailing slashes first so "…/repo/" does not yield "".
            name = repo_url.rstrip("/").split("/")[-1]
            # Only strip ".git" as a suffix; a plain replace() would also
            # mangle names such as "user.github.io".
            if name.endswith(".git"):
                name = name[: -len(".git")]
            return name
        if not local_dir:
            raise ValueError(
                "Cannot derive project name: neither 'repo_url' nor 'local_dir' is set"
            )
        return os.path.basename(os.path.abspath(local_dir))

    def prep(self, shared):
        """Gather crawl parameters from ``shared``.

        Writes ``shared["project_name"]`` when it is missing or empty.
        Requires the keys ``include_patterns``, ``exclude_patterns`` and
        ``max_file_size`` (``KeyError`` otherwise).

        Returns:
            dict of keyword values consumed by :meth:`exec`.
        """
        repo_url = shared.get("repo_url")
        local_dir = shared.get("local_dir")

        if not shared.get("project_name"):
            shared["project_name"] = self._derive_project_name(repo_url, local_dir)

        return {
            "repo_url": repo_url,
            "local_dir": local_dir,
            "token": shared.get("github_token"),
            "include_patterns": shared["include_patterns"],
            "exclude_patterns": shared["exclude_patterns"],
            "max_file_size": shared["max_file_size"],
            "use_relative_paths": True,
        }

    def exec(self, prep_res):
        """Run the crawler selected by ``prep_res`` and return its files.

        A truthy ``repo_url`` selects ``crawl_github_files``; otherwise
        ``crawl_local_files`` is used on ``local_dir``.

        Returns:
            list of ``(path, content)`` tuples.

        Raises:
            ValueError: If the crawl result contains no files.
        """
        start_time = time.time()

        if prep_res["repo_url"]:
            print_phase_start("Repository Crawling", Icons.CRAWLING)
            result = crawl_github_files(
                repo_url=prep_res["repo_url"],
                token=prep_res["token"],
                include_patterns=prep_res["include_patterns"],
                exclude_patterns=prep_res["exclude_patterns"],
                max_file_size=prep_res["max_file_size"],
                use_relative_paths=prep_res["use_relative_paths"],
            )
        else:
            print_phase_start("Directory Crawling", Icons.CRAWLING)
            result = crawl_local_files(
                directory=prep_res["local_dir"],
                include_patterns=prep_res["include_patterns"],
                exclude_patterns=prep_res["exclude_patterns"],
                max_file_size=prep_res["max_file_size"],
                use_relative_paths=prep_res["use_relative_paths"],
            )

        # Convert dict to list of tuples: [(path, content), ...]
        files_list = list(result.get("files", {}).items())
        if not files_list:
            raise ValueError("Failed to fetch files")

        total_size = sum(len(content) for _, content in files_list)
        elapsed = time.time() - start_time

        print_success(
            f"Complete ({len(files_list)} files, {format_size(total_size)})",
            elapsed,
            indent=1,
        )
        print_phase_end()

        return files_list

    def post(self, shared, prep_res, exec_res):
        """Store the crawled ``(path, content)`` tuples in ``shared["files"]``."""
        shared["files"] = exec_res
102
+
103
+
104
class IdentifyAbstractions(Node):
    """Ask the LLM to name the core abstractions of the crawled codebase."""

    def prep(self, shared):
        """Build the LLM context from ``shared["files"]``.

        Returns:
            Tuple ``(context, file_listing_for_prompt, file_count,
            project_name, language, use_cache, max_abstraction_num)``.
            ``language`` defaults to ``"english"``, ``use_cache`` to ``True``
            and ``max_abstraction_num`` to ``10``.
        """
        files_data = shared["files"]
        project_name = shared["project_name"]
        language = shared.get("language", "english")
        use_cache = shared.get("use_cache", True)
        max_abstraction_num = shared.get("max_abstraction_num", 10)

        # One join instead of repeated string concatenation in a loop.
        context = "".join(
            f"--- File Index {i}: {path} ---\n{content}\n\n"
            for i, (path, content) in enumerate(files_data)
        )
        # The "# path" suffix is only a hint for the LLM.
        file_listing_for_prompt = "\n".join(
            f"- {i} # {path}" for i, (path, _) in enumerate(files_data)
        )
        return (
            context,
            file_listing_for_prompt,
            len(files_data),
            project_name,
            language,
            use_cache,
            max_abstraction_num,
        )

    def exec(self, prep_res):
        """Query the LLM and validate the returned abstraction list.

        Returns:
            list of ``{"name": str, "description": str, "files": [int]}``
            where ``files`` is sorted and de-duplicated.

        Raises:
            ValueError: If the response has no ```` ```yaml ```` block, is not
                a list of well-formed items, or references a file index that
                cannot be parsed or is out of range.
        """
        start_time = time.time()
        (
            context,
            file_listing_for_prompt,
            file_count,
            project_name,
            language,
            use_cache,
            max_abstraction_num,
        ) = prep_res

        print_phase_start("LLM Analysis", Icons.PROCESSING)
        print_operation("Identifying abstractions...", Icons.PROCESSING, indent=1)

        # Add language instruction and hints only if not English
        language_instruction = ""
        name_lang_hint = ""
        desc_lang_hint = ""
        if language.lower() != "english":
            language_instruction = f"IMPORTANT: Generate the `name` and `description` for each abstraction in **{language.capitalize()}** language. Do NOT use English for these fields.\n\n"
            # Keep specific hints here as name/description are primary targets
            name_lang_hint = f" (value in {language.capitalize()})"
            desc_lang_hint = f" (value in {language.capitalize()})"

        prompt = f"""
For the project `{project_name}`:

Codebase Context:
{context}

{language_instruction}Analyze the codebase context.
Identify the top 5 to {max_abstraction_num} core most important abstractions for technical documentation that helps existing and new engineers understand the codebase.

For each abstraction, provide:
1. A concise `name`{name_lang_hint}.
2. A technical `description` explaining what it does, its responsibilities, and role in the system, in around 100 words{desc_lang_hint}.
3. A list of relevant `file_indices` (integers) using the format `idx # path/comment`.

List of file indices and paths present in the context:
{file_listing_for_prompt}

Format the output as a YAML list of dictionaries:

```yaml
- name: |
    Query Processing{name_lang_hint}
  description: |
    Handles incoming queries and routes them to appropriate handlers.
    Responsible for parsing, validation, and initial processing of user requests.{desc_lang_hint}
  file_indices:
    - 0 # path/to/file1.py
    - 3 # path/to/related.py
- name: |
    Query Optimization{name_lang_hint}
  description: |
    Optimizes query execution by analyzing patterns and caching results.
    Manages performance improvements and resource allocation for query processing.{desc_lang_hint}
  file_indices:
    - 5 # path/to/another.js
# ... up to {max_abstraction_num} abstractions
```"""
        # Use cache only if enabled and not retrying
        response = call_llm(prompt, use_cache=(use_cache and self.cur_retry == 0))

        # --- Validation ---
        fenced = response.strip().split("```yaml")
        if len(fenced) < 2:
            # Without this guard a missing fence surfaces as a bare IndexError.
            raise ValueError("LLM response does not contain a ```yaml block")
        yaml_str = fenced[1].split("```")[0].strip()
        abstractions = yaml.safe_load(yaml_str)

        if not isinstance(abstractions, list):
            raise ValueError("LLM Output is not a list")

        required_keys = ("name", "description", "file_indices")
        validated_abstractions = []
        for item in abstractions:
            if not isinstance(item, dict) or not all(k in item for k in required_keys):
                raise ValueError(f"Missing keys in abstraction item: {item}")
            if not isinstance(item["name"], str):
                raise ValueError(f"Name is not a string in item: {item}")
            if not isinstance(item["description"], str):
                raise ValueError(f"Description is not a string in item: {item}")
            if not isinstance(item["file_indices"], list):
                raise ValueError(f"file_indices is not a list in item: {item}")

            validated_indices = []
            for idx_entry in item["file_indices"]:
                # Only the parsing lives in the try block, so the range error
                # below is not rewritten into a misleading "could not parse".
                try:
                    if isinstance(idx_entry, int):
                        idx = idx_entry
                    elif isinstance(idx_entry, str) and "#" in idx_entry:
                        idx = int(idx_entry.split("#")[0].strip())
                    else:
                        idx = int(str(idx_entry).strip())
                except (ValueError, TypeError) as err:
                    raise ValueError(
                        f"Could not parse index from entry: {idx_entry} in item {item['name']}"
                    ) from err

                if not (0 <= idx < file_count):
                    raise ValueError(
                        f"Invalid file index {idx} found in item {item['name']}. Max index is {file_count - 1}."
                    )
                validated_indices.append(idx)

            # Store only the required fields; name/description may be translated.
            validated_abstractions.append(
                {
                    "name": item["name"],
                    "description": item["description"],
                    "files": sorted(set(validated_indices)),
                }
            )

        elapsed = time.time() - start_time
        print_success(
            f"Found {len(validated_abstractions)} abstractions", elapsed, indent=2
        )

        return validated_abstractions

    def post(self, shared, prep_res, exec_res):
        """Store the validated abstractions in ``shared["abstractions"]``."""
        # List of {"name": str, "description": str, "files": [int]}
        shared["abstractions"] = exec_res
270
+
271
+
272
class AnalyzeRelationships(Node):
    """Ask the LLM for a project summary and the links between abstractions."""

    def prep(self, shared):
        """Assemble the abstraction listing and the relevant file contents.

        Returns:
            Tuple ``(context, abstraction_listing, num_abstractions,
            project_name, language, use_cache)``.
        """
        # Each abstraction has 'files' (list of indices); name/description
        # may already be translated.
        abstractions = shared["abstractions"]
        files_data = shared["files"]
        project_name = shared["project_name"]
        language = shared.get("language", "english")
        use_cache = shared.get("use_cache", True)

        # Real newlines here: an escaped "\\n" would put a literal
        # backslash-n into the prompt instead of a line break.
        context_parts = ["Identified Abstractions:\n"]
        all_relevant_indices = set()
        abstraction_info_for_prompt = []
        for i, abstr in enumerate(abstractions):
            file_indices_str = ", ".join(map(str, abstr["files"]))
            context_parts.append(
                f"- Index {i}: {abstr['name']} (Relevant file indices: [{file_indices_str}])\n Description: {abstr['description']}"
                + "\n"
            )
            abstraction_info_for_prompt.append(f"{i} # {abstr['name']}")
            all_relevant_indices.update(abstr["files"])

        context_parts.append(
            "\nRelevant File Snippets (Referenced by Index and Path):\n"
        )
        relevant_files_content_map = get_content_for_indices(
            files_data, sorted(all_relevant_indices)
        )
        context_parts.append(
            "\n\n".join(
                f"--- File: {idx_path} ---\n{content}"
                for idx_path, content in relevant_files_content_map.items()
            )
        )

        return (
            "".join(context_parts),
            "\n".join(abstraction_info_for_prompt),
            len(abstractions),
            project_name,
            language,
            use_cache,
        )

    def exec(self, prep_res):
        """Query the LLM and validate the summary and relationship list.

        Returns:
            ``{"summary": str, "details": [{"from": int, "to": int,
            "label": str}]}``.

        Raises:
            ValueError: If the response has no ```` ```yaml ```` block, lacks
                the expected keys/types, or a relationship index cannot be
                parsed or is out of range.
        """
        start_time = time.time()
        (
            context,
            abstraction_listing,
            num_abstractions,
            project_name,
            language,
            use_cache,
        ) = prep_res

        print_operation("Analyzing relationships...", Icons.ANALYZING, indent=1)

        # Add language instruction and hints only if not English
        language_instruction = ""
        lang_hint = ""
        list_lang_note = ""
        if language.lower() != "english":
            language_instruction = f"IMPORTANT: Generate the `summary` and relationship `label` fields in **{language.capitalize()}** language. Do NOT use English for these fields.\n\n"
            lang_hint = f" (in {language.capitalize()})"
            list_lang_note = f" (Names might be in {language.capitalize()})"  # Note for the input list

        prompt = f"""
Based on the following abstractions and relevant code snippets from the project `{project_name}`:

List of Abstraction Indices and Names{list_lang_note}:
{abstraction_listing}

Context (Abstractions, Descriptions, Code):
{context}

{language_instruction}Please provide:
1. A high-level technical `summary` of the project's purpose, architecture, functionalities and their responsibilities{lang_hint}. Use markdown formatting with **bold** and *italic* text to highlight important concepts.
2. A list (`relationships`) describing the key interactions between these abstractions. For each relationship, specify:
    - `from_abstraction`: Index of the source abstraction (e.g., `0 # AbstractionName1`)
    - `to_abstraction`: Index of the target abstraction (e.g., `1 # AbstractionName2`)
    - `label`: A brief label for the interaction **in just a few words**{lang_hint} (e.g., "Manages", "Inherits", "Uses").
    Ideally the relationship should be backed by one abstraction calling or passing parameters to another.
    Simplify the relationship and exclude those non-important ones.

IMPORTANT: Make sure EVERY abstraction is involved in at least ONE relationship (either as source or target). Each abstraction index must appear at least once across all relationships.

Format the output as YAML:

```yaml
summary: |
  A technical overview of the project architecture{lang_hint}.
  Can span multiple lines with **bold** and *italic* for emphasis.
relationships:
  - from_abstraction: 0 # AbstractionName1
    to_abstraction: 1 # AbstractionName2
    label: "Manages"{lang_hint}
  - from_abstraction: 2 # AbstractionName3
    to_abstraction: 0 # AbstractionName1
    label: "Provides config"{lang_hint}
  # ... other relationships
```

Now, provide the YAML output:
"""
        # Use cache only if enabled and not retrying
        response = call_llm(prompt, use_cache=(use_cache and self.cur_retry == 0))

        # --- Validation ---
        fenced = response.strip().split("```yaml")
        if len(fenced) < 2:
            # Without this guard a missing fence surfaces as a bare IndexError.
            raise ValueError("LLM response does not contain a ```yaml block")
        yaml_str = fenced[1].split("```")[0].strip()
        relationships_data = yaml.safe_load(yaml_str)

        if not isinstance(relationships_data, dict) or not all(
            k in relationships_data for k in ("summary", "relationships")
        ):
            raise ValueError(
                "LLM output is not a dict or missing keys ('summary', 'relationships')"
            )
        if not isinstance(relationships_data["summary"], str):
            raise ValueError("summary is not a string")
        if not isinstance(relationships_data["relationships"], list):
            raise ValueError("relationships is not a list")

        required_keys = ("from_abstraction", "to_abstraction", "label")
        validated_relationships = []
        for rel in relationships_data["relationships"]:
            if not isinstance(rel, dict) or not all(k in rel for k in required_keys):
                raise ValueError(
                    f"Missing keys (expected from_abstraction, to_abstraction, label) in relationship item: {rel}"
                )
            if not isinstance(rel["label"], str):
                raise ValueError(f"Relationship label is not a string: {rel}")

            # Only the parsing lives in the try block, so the range error
            # below is not rewritten into a misleading "could not parse".
            try:
                from_idx = int(str(rel["from_abstraction"]).split("#")[0].strip())
                to_idx = int(str(rel["to_abstraction"]).split("#")[0].strip())
            except (ValueError, TypeError) as err:
                raise ValueError(
                    f"Could not parse indices from relationship: {rel}"
                ) from err

            if not (
                0 <= from_idx < num_abstractions and 0 <= to_idx < num_abstractions
            ):
                raise ValueError(
                    f"Invalid index in relationship: from={from_idx}, to={to_idx}. Max index is {num_abstractions-1}."
                )
            validated_relationships.append(
                {
                    "from": from_idx,
                    "to": to_idx,
                    "label": rel["label"],  # Potentially translated label
                }
            )

        elapsed = time.time() - start_time
        print_success("Generated project summary", elapsed, indent=2)

        return {
            "summary": relationships_data["summary"],  # Potentially translated
            "details": validated_relationships,  # Index-based relationships
        }

    def post(self, shared, prep_res, exec_res):
        """Store the summary and relationships in ``shared["relationships"]``."""
        # {"summary": str, "details": [{"from": int, "to": int, "label": str}]}
        shared["relationships"] = exec_res
446
+
447
+
448
class OrderComponents(Node):
    """Ask the LLM in which order the abstractions should be documented."""

    def prep(self, shared):
        """Build the abstraction listing and relationship context.

        Returns:
            Tuple ``(abstraction_listing, context, num_abstractions,
            project_name, list_lang_note, use_cache)``.
        """
        abstractions = shared["abstractions"]  # Name/description might be translated
        relationships = shared["relationships"]  # Summary/label might be translated
        project_name = shared["project_name"]
        language = shared.get("language", "english")
        use_cache = shared.get("use_cache", True)

        abstraction_listing = "\n".join(
            f"- {i} # {a['name']}" for i, a in enumerate(abstractions)
        )

        # Notes are only added when the inputs may be in another language.
        summary_note = ""
        list_lang_note = ""
        if language.lower() != "english":
            summary_note = (
                f" (Note: Project Summary might be in {language.capitalize()})"
            )
            list_lang_note = f" (Names might be in {language.capitalize()})"

        context_parts = [
            f"Project Summary{summary_note}:\n{relationships['summary']}\n\n",
            "Relationships (Indices refer to abstractions above):\n",
        ]
        for rel in relationships["details"]:
            from_name = abstractions[rel["from"]]["name"]
            to_name = abstractions[rel["to"]]["name"]
            context_parts.append(
                f"- From {rel['from']} ({from_name}) to {rel['to']} ({to_name}): {rel['label']}\n"
            )

        return (
            abstraction_listing,
            "".join(context_parts),
            len(abstractions),
            project_name,
            list_lang_note,
            use_cache,
        )

    def exec(self, prep_res):
        """Query the LLM and validate the returned ordering.

        Returns:
            list of abstraction indices; every index in
            ``range(num_abstractions)`` appears exactly once.

        Raises:
            ValueError: If the response has no ```` ```yaml ```` block, is not
                a list, or contains an unparsable, out-of-range, duplicate or
                missing index.
        """
        start_time = time.time()
        (
            abstraction_listing,
            context,
            num_abstractions,
            project_name,
            list_lang_note,
            use_cache,
        ) = prep_res

        print_operation("Determining component order...", Icons.ORDERING, indent=1)
        # No language variation needed in the instructions; the input names
        # might be translated, hence the note.
        prompt = f"""
Given the following project abstractions and their relationships for the project ```` {project_name} ````:

Abstractions (Index # Name){list_lang_note}:
{abstraction_listing}

Context about relationships and project summary:
{context}

If you are going to create technical documentation for ```` {project_name} ````, what is the best order to document these components, from first to last?
Ideally, first document those that are the most important or foundational, perhaps user-facing concepts or entry points. Then move to more detailed, lower-level implementation details or supporting concepts.

Output the ordered list of abstraction indices, including the name in a comment for clarity. Use the format `idx # AbstractionName`.

```yaml
- 2 # FoundationalConcept
- 0 # CoreClassA
- 1 # CoreClassB (uses CoreClassA)
- ...
```

Now, provide the YAML output:
"""
        # Use cache only if enabled and not retrying
        response = call_llm(prompt, use_cache=(use_cache and self.cur_retry == 0))

        # --- Validation ---
        fenced = response.strip().split("```yaml")
        if len(fenced) < 2:
            # Without this guard a missing fence surfaces as a bare IndexError.
            raise ValueError("LLM response does not contain a ```yaml block")
        yaml_str = fenced[1].split("```")[0].strip()
        ordered_indices_raw = yaml.safe_load(yaml_str)

        if not isinstance(ordered_indices_raw, list):
            raise ValueError("LLM output is not a list")

        ordered_indices = []
        seen_indices = set()
        for entry in ordered_indices_raw:
            # Only the parsing lives in the try block, so the range and
            # duplicate errors below keep their specific messages.
            try:
                if isinstance(entry, int):
                    idx = entry
                elif isinstance(entry, str) and "#" in entry:
                    idx = int(entry.split("#")[0].strip())
                else:
                    idx = int(str(entry).strip())
            except (ValueError, TypeError) as err:
                raise ValueError(
                    f"Could not parse index from ordered list entry: {entry}"
                ) from err

            if not (0 <= idx < num_abstractions):
                raise ValueError(
                    f"Invalid index {idx} in ordered list. Max index is {num_abstractions-1}."
                )
            if idx in seen_indices:
                raise ValueError(f"Duplicate index {idx} found in ordered list.")
            ordered_indices.append(idx)
            seen_indices.add(idx)

        # Check if all abstractions are included
        if len(ordered_indices) != num_abstractions:
            raise ValueError(
                f"Ordered list length ({len(ordered_indices)}) does not match number of abstractions ({num_abstractions}). Missing indices: {set(range(num_abstractions)) - seen_indices}"
            )

        elapsed = time.time() - start_time
        print_success(f"Order determined: {ordered_indices}", elapsed, indent=2)
        print_phase_end()

        return ordered_indices

    def post(self, shared, prep_res, exec_res):
        """Store the ordered index list in ``shared["component_order"]``."""
        shared["component_order"] = exec_res
580
+
581
+
582
+ class WriteComponents(BatchNode):
583
def prep(self, shared):
    """Prepare one work item per component listed in ``shared["component_order"]``.

    Resets ``self.components_written_so_far`` to an empty list; it is
    temporary storage used across the per-item ``exec`` calls.

    Returns:
        list of dicts, one per valid abstraction index, carrying the
        abstraction details, related file contents, the full component
        listing, the index-to-filename mapping, previous/next component
        info (``None`` when absent), language, cache flag and
        documentation mode. Indices outside ``range(len(abstractions))``
        are skipped with a printed warning.
    """
    component_order = shared["component_order"]  # List of indices
    # List of {"name": str, "description": str, "files": [int]}
    abstractions = shared["abstractions"]
    files_data = shared["files"]  # List of (path, content) tuples
    language = shared.get("language", "english")
    use_cache = shared.get("use_cache", True)
    documentation_mode = shared.get("documentation_mode", "minimal")

    # Built up progressively during the batch run, not stored in shared yet.
    self.components_written_so_far = []

    # First pass: the complete listing and the index -> filename mapping
    # must exist before any item can reference its neighbours.
    all_components = []
    component_filenames = {}
    for i, abstraction_index in enumerate(component_order):
        if not (0 <= abstraction_index < len(abstractions)):
            continue
        component_num = i + 1
        component_name = abstractions[abstraction_index]["name"]
        # Safe filename: every non-alphanumeric character becomes "_".
        safe_name = "".join(
            c if c.isalnum() else "_" for c in component_name
        ).lower()
        filename = f"{i+1:02d}_{safe_name}.md"
        # Strip newlines from the name to prevent broken markdown links.
        clean_component_name = component_name.replace("\n", " ").strip()
        all_components.append(
            f"{component_num}. [{clean_component_name}]({filename})"
        )
        component_filenames[abstraction_index] = {
            "num": component_num,
            "name": component_name,
            "filename": filename,
        }

    full_component_listing = "\n".join(all_components)

    # Second pass: one work item per valid component.
    items_to_process = []
    last_position = len(component_order) - 1
    for i, abstraction_index in enumerate(component_order):
        if not (0 <= abstraction_index < len(abstractions)):
            print(
                f"Warning: Invalid abstraction index {abstraction_index} in component_order. Skipping."
            )
            continue

        abstraction_details = abstractions[abstraction_index]
        related_files_content_map = get_content_for_indices(
            files_data, abstraction_details.get("files", [])
        )

        # .get(): a neighbour that was skipped as invalid has no entry, and
        # a plain lookup would raise KeyError instead of skipping gracefully.
        prev_component = None
        if i > 0:
            prev_component = component_filenames.get(component_order[i - 1])

        next_component = None
        if i < last_position:
            next_component = component_filenames.get(component_order[i + 1])

        items_to_process.append(
            {
                "component_num": i + 1,
                "abstraction_index": abstraction_index,
                "abstraction_details": abstraction_details,
                "related_files_content_map": related_files_content_map,
                "project_name": shared["project_name"],
                "full_component_listing": full_component_listing,
                "component_filenames": component_filenames,
                "prev_component": prev_component,
                "next_component": next_component,
                "language": language,
                "use_cache": use_cache,
                "documentation_mode": documentation_mode,
            }
        )

    print_phase_start("Content Generation", Icons.WRITING)
    return items_to_process  # Iterable for BatchNode
677
+
678
+ def exec(self, item):
679
+ start_time = time.time()
680
+ # This runs for each item prepared above
681
+ abstraction_name = item["abstraction_details"][
682
+ "name"
683
+ ] # Potentially translated name
684
+ abstraction_description = item["abstraction_details"][
685
+ "description"
686
+ ] # Potentially translated description
687
+ component_num = item["component_num"]
688
+ project_name = item.get("project_name")
689
+ language = item.get("language", "english")
690
+ use_cache = item.get("use_cache", True) # Read use_cache from item
691
+ documentation_mode = item.get(
692
+ "documentation_mode", "minimal"
693
+ ) # Read documentation_mode from item
694
+
695
+ # Prepare file context string from the map
696
+ file_context_str = "\n\n".join(
697
+ f"--- File: {idx_path.split('# ')[1] if '# ' in idx_path else idx_path} ---\n{content}"
698
+ for idx_path, content in item["related_files_content_map"].items()
699
+ )
700
+
701
+ # Get summary of components written *before* this one
702
+ # Use the temporary instance variable
703
+ previous_components_summary = "\n---\n".join(self.components_written_so_far)
704
+
705
+ # Add language instruction and context notes only if not English
706
+ language_instruction = ""
707
+ concept_details_note = ""
708
+ structure_note = ""
709
+ prev_summary_note = ""
710
+ instruction_lang_note = ""
711
+ mermaid_lang_note = ""
712
+ code_comment_note = ""
713
+ link_lang_note = ""
714
+ tone_note = ""
715
+ if language.lower() != "english":
716
+ lang_cap = language.capitalize()
717
+ language_instruction = f"IMPORTANT: Write this ENTIRE documentation component in **{lang_cap}**. Some input context (like concept name, description, component list, previous summary) might already be in {lang_cap}, but you MUST translate ALL other generated content including explanations, examples, technical terms, and potentially code comments into {lang_cap}. DO NOT use English anywhere except in code syntax, required proper nouns, or when specified. The entire output MUST be in {lang_cap}.\n\n"
718
+ concept_details_note = f" (Note: Provided in {lang_cap})"
719
+ structure_note = f" (Note: Component names might be in {lang_cap})"
720
+ prev_summary_note = f" (Note: This summary might be in {lang_cap})"
721
+ instruction_lang_note = f" (in {lang_cap})"
722
+ mermaid_lang_note = f" (Use {lang_cap} for labels/text if appropriate)"
723
+ code_comment_note = f" (Translate to {lang_cap} if possible, otherwise keep minimal English for clarity)"
724
+ link_lang_note = (
725
+ f" (Use the {lang_cap} component title from the structure above)"
726
+ )
727
+ tone_note = f" (appropriate for {lang_cap} readers)"
728
+
729
+ # Build prompt based on mode
730
+ if documentation_mode == "minimal":
731
+ # Minimal mode: shorter, more direct instructions
732
+ prompt = f"""
733
+ {language_instruction}Write short and concise intent-focused documentation. Be brief but keep all critical info: architecture, design, components, integrations. Focus on key facts and intent. Avoid verbosity. Keep structure but be direct.
734
+
735
+ Write technical documentation (in Markdown format) for engineers working with the component "{abstraction_name}" in the project `{project_name}`. This is Component {component_num}.
736
+
737
+ Component/Concept Details{concept_details_note}:
738
+ - Name: {abstraction_name}
739
+ - Description:
740
+ {abstraction_description}
741
+
742
+ Complete Documentation Structure{structure_note}:
743
+ {item["full_component_listing"]}
744
+
745
+ Context from previous components{prev_summary_note}:
746
+ {previous_components_summary if previous_components_summary else "This is the first component."}
747
+
748
+ Relevant Code Snippets (Code itself remains unchanged):
749
+ {file_context_str if file_context_str else "No specific code snippets provided for this abstraction."}
750
+
751
+ Instructions for the documentation (Generate content in {language.capitalize()} unless specified otherwise):
752
+ - Start with clear heading: `# Component {component_num}: {abstraction_name}`. Use the provided component name.
753
+
754
+ - If not first component, reference previous component{instruction_lang_note} with Markdown link{link_lang_note}.
755
+
756
+ - Why it exists{instruction_lang_note}: core responsibilities, purpose in architecture.
757
+
758
+ - What it does{instruction_lang_note}: key responsibilities, how it works, integration points.
759
+
760
+ - Avoid code blocks if not critical. If code blocks are needed, keep them BELOW 5 lines. Simplify aggressively. Use comments{code_comment_note} to skip non-essential details. Explain after each block{instruction_lang_note}. No imports/packages.
761
+
762
+ - Internal implementation{instruction_lang_note}: step-by-step walkthrough (code-light). Use simple sequenceDiagram (max 5 participants). If participant name has space: `participant QP as Query Processing`. {mermaid_lang_note}.
763
+
764
+ - IMPORTANT: Link to other components: [Component Title](filename.md). Use Complete Documentation Structure for filename/title{link_lang_note}.
765
+
766
+ - Use mermaid diagrams for complex concepts (```mermaid``` format). {mermaid_lang_note}.
767
+
768
+ - Key takeaways{instruction_lang_note}: what it handles, common patterns, integration points. Link to next component if exists{link_lang_note}.
769
+
770
+ - Tone: technical and precise{tone_note}.
771
+
772
+ - Output *only* Markdown content (DONT NEED ```markdown``` tags).
773
+ """
774
+ else:
775
+ # Comprehensive mode: ORIGINAL prompt unchanged
776
+ prompt = f"""
777
+ {language_instruction}Write technical documentation (in Markdown format) for engineers working with the component "{abstraction_name}" in the project `{project_name}`. This is Component {component_num}.
778
+
779
+ Component/Concept Details{concept_details_note}:
780
+ - Name: {abstraction_name}
781
+ - Description:
782
+ {abstraction_description}
783
+
784
+ Complete Documentation Structure{structure_note}:
785
+ {item["full_component_listing"]}
786
+
787
+ Context from previous components{prev_summary_note}:
788
+ {previous_components_summary if previous_components_summary else "This is the first component."}
789
+
790
+ Relevant Code Snippets (Code itself remains unchanged):
791
+ {file_context_str if file_context_str else "No specific code snippets provided for this abstraction."}
792
+
793
+ Instructions for the documentation (Generate content in {language.capitalize()} unless specified otherwise):
794
+ - Start with a clear heading (e.g., `# Component {component_num}: {abstraction_name}`). Use the provided component name.
795
+
796
+ - If this is not the first component, begin with a brief reference to the previous component{instruction_lang_note}, linking to it with a proper Markdown link using its name{link_lang_note}.
797
+
798
+ - Begin with why this component exists{instruction_lang_note} - what problem it solves and its core responsibilities. Focus on the component's purpose in the system architecture.
799
+
800
+ - Document what this component does{instruction_lang_note} - its key responsibilities, how it works, and how it integrates with other components.
801
+
802
+ - If the component is complex, break it down into key concepts. Explain each concept with technical precision{instruction_lang_note}.
803
+
804
+ - Each code block should be BELOW 10 lines! If longer code blocks are needed, break them down into smaller pieces and walk through them one-by-one. Aggresively simplify the code to make it minimal. Use comments{code_comment_note} to skip non-important implementation details. Each code block should have a solid explanation right after it{instruction_lang_note}. Make sure you dont include Imports or packages in the code blocks, keep it focused on the key logic always.
805
+
806
+ - Describe the internal implementation to help understand what's under the hood{instruction_lang_note}. First provide a non-code or code-light walkthrough on what happens step-by-step when the abstraction is called{instruction_lang_note}. It's recommended to use a simple sequenceDiagram - keep it minimal with at most 5 participants to ensure clarity. If participant name has space, use: `participant QP as Query Processing`. {mermaid_lang_note}.
807
+
808
+ - Then dive deeper into code for the internal implementation with references to files. Provide example code blocks, but make them similarly simple and beginner-friendly. Dont include imports or packages in the code blocks. Explain{instruction_lang_note}.
809
+
810
+ - IMPORTANT: When you need to refer to other core components covered in other sections, ALWAYS use proper Markdown links like this: [Component Title](filename.md). Use the Complete Documentation Structure above to find the correct filename and the component title{link_lang_note}. Translate the surrounding text.
811
+
812
+ - Use mermaid diagrams to illustrate complex concepts (```mermaid``` format). {mermaid_lang_note}.
813
+
814
+ - Provide concrete code examples from the codebase showing actual usage and implementation patterns{instruction_lang_note}.
815
+
816
+ - End the component documentation with key takeaways{instruction_lang_note}: what this component handles, common usage patterns, and integration points. If there is a next component, use a proper Markdown link: [Next Component Title](next_component_filename){link_lang_note}.
817
+
818
+ - Ensure the tone is technical and precise{tone_note}.
819
+
820
+ - Output *only* the Markdown content for this component.
821
+
822
+ Now, directly provide technical Markdown documentation (DON'T need ```markdown``` tags):
823
+ """
824
+ component_content = call_llm(
825
+ prompt, use_cache=(use_cache and self.cur_retry == 0)
826
+ ) # Use cache only if enabled and not retrying
827
+
828
+ elapsed = time.time() - start_time
829
+
830
+ # Store timing for later summary
831
+ if not hasattr(self, "component_times"):
832
+ self.component_times = []
833
+ self.component_times.append(elapsed)
834
+
835
+ # Show the operation with timing
836
+ print_operation(
837
+ f"Component {component_num}: {abstraction_name}",
838
+ Icons.WRITING,
839
+ indent=1,
840
+ elapsed_time=elapsed,
841
+ )
842
+ # Basic validation/cleanup
843
+ actual_heading = f"# Component {component_num}: {abstraction_name}" # Use potentially translated name
844
+ if not component_content.strip().startswith(f"# Component {component_num}"):
845
+ # Add heading if missing or incorrect, trying to preserve content
846
+ lines = component_content.strip().split("\n")
847
+ if lines and lines[0].strip().startswith(
848
+ "#"
849
+ ): # If there's some heading, replace it
850
+ lines[0] = actual_heading
851
+ component_content = "\n".join(lines)
852
+ else: # Otherwise, prepend it
853
+ component_content = f"{actual_heading}\n\n{component_content}"
854
+
855
+ # Add the generated content to our temporary list for the next iteration's context
856
+ self.components_written_so_far.append(component_content)
857
+
858
+ return component_content # Return the Markdown string (potentially translated)
859
+
860
def post(self, shared, prep_res, exec_res_list):
    """Publish the generated component documents and print the phase summary.

    Args:
        shared: Shared store; receives the generated Markdown under
            ``"components"``.
        prep_res: Result of ``prep`` (unused here).
        exec_res_list: Markdown strings returned by ``exec``, one per
            component, in batch order.
    """
    shared["components"] = exec_res_list

    # ``exec`` appends one elapsed time per component; if it never ran,
    # the attribute does not exist and the total is reported as zero.
    if hasattr(self, "component_times"):
        total_time = sum(self.component_times)
    else:
        total_time = 0

    print_success(f"{len(exec_res_list)} components written", total_time, indent=1)
    print_phase_end()

    # Drop the per-run scratch attributes that ``exec`` accumulated on the
    # instance so they do not leak into a later run of the same node.
    for scratch_attr in ("component_times", "components_written_so_far"):
        if hasattr(self, scratch_attr):
            delattr(self, scratch_attr)
876
+
877
+
878
class GenerateDocContent(Node):
    """Assemble the index page, per-component files and combined document.

    Everything is built in memory; this node writes nothing to disk. The
    resulting content dict is stored in ``shared["doc_content"]``.
    """

    def prep(self, shared):
        """Collect the inputs needed for assembly from the shared store.

        Returns:
            A dict with ``project_name``, ``output_path`` (defaults to
            ``"output"``), ``repo_url`` (``None`` when the shared store has
            no repository URL), ``relationships_data``, ``component_order``,
            ``abstractions`` and ``components_content``.

        Raises:
            KeyError: If a required shared entry is missing.
        """
        return {
            "project_name": shared["project_name"],
            "output_path": shared.get("output_dir", "output"),
            "repo_url": shared.get("repo_url"),
            # {"summary": str, "details": [{"from": int, "to": int, "label": str}]}
            "relationships_data": shared["relationships"],
            # Abstraction indices in presentation order.
            "component_order": shared["component_order"],
            # List of dicts; the "name" entry is used for labels and filenames.
            "abstractions": shared["abstractions"],
            # List of Markdown strings, one per written component.
            "components_content": shared["components"],
        }

    def _generate_combined_content(
        self, project_name, index_content, components_content
    ):
        """Generate the combined documentation file content.

        The result is an H1 with the project name, the index content with
        its attribution footer stripped, then every component with its
        headings shifted down one level. Sections are separated by
        horizontal rules and a single attribution line closes the document.
        """
        from wikigen.utils.adjust_headings import (
            adjust_heading_levels,
            strip_attribution_footer,
        )

        separator = "\n\n---\n\n"
        shifted_components = [
            adjust_heading_levels(component, shift=1)
            for component in components_content
        ]
        return "".join(
            (
                f"# {project_name}\n\n",
                strip_attribution_footer(index_content),
                separator,
                # Separators go between components only, not after the last.
                separator.join(shifted_components),
                "\n\n---\n\nWiki created by [WIKIGEN](https://github.com/usesalt/wikigen)\n",
            )
        )

    @staticmethod
    def _build_mermaid_diagram(abstractions, relationship_details):
        """Render abstractions and their relationships as a Mermaid flowchart."""
        max_label_len = 30
        lines = ["flowchart TD"]

        # One node per abstraction; quotes and newlines are removed because
        # the label is embedded in a double-quoted Mermaid string.
        for i, abstr in enumerate(abstractions):
            node_label = abstr["name"].replace('"', "").replace("\n", " ").strip()
            lines.append(f' A{i}["{node_label}"]')

        # One labelled edge per relationship; long labels are truncated with
        # an ellipsis so the diagram stays readable.
        for rel in relationship_details:
            edge_label = rel["label"].replace('"', "").replace("\n", " ")
            if len(edge_label) > max_label_len:
                edge_label = edge_label[: max_label_len - 3] + "..."
            lines.append(f' A{rel["from"]} -- "{edge_label}" --> A{rel["to"]}')

        return "\n".join(lines)

    def exec(self, prep_res):
        """Build the index page, component file list and combined document.

        Args:
            prep_res: The dict produced by ``prep``.

        Returns:
            A dict with ``project_name``, ``output_path``, ``index_content``,
            ``component_files`` (list of ``{"filename", "content"}`` dicts)
            and ``combined_content``.
        """
        start_time = time.time()
        project_name = prep_res["project_name"]
        output_path = prep_res["output_path"]
        repo_url = prep_res["repo_url"]
        relationships_data = prep_res["relationships_data"]
        component_order = prep_res["component_order"]
        abstractions = prep_res["abstractions"]
        components_content = prep_res["components_content"]

        print_phase_start("Documentation Assembly", Icons.GENERATING)

        mermaid_diagram = self._build_mermaid_diagram(
            abstractions, relationships_data["details"]
        )

        # --- index.md content (fixed strings stay in English) ---
        index_parts = [f"{relationships_data['summary']}\n\n"]
        # repo_url is None when no repository URL was supplied (e.g. the
        # project was read from a local directory); omit the line instead of
        # rendering a broken "[None](None)" link.
        if repo_url:
            index_parts.append(f"**Source Repository:** [{repo_url}]({repo_url})\n\n")
        index_parts.append(f"```mermaid\n{mermaid_diagram}\n```\n\n")
        index_parts.append("## Components\n\n")

        component_files = []
        for i, abstraction_index in enumerate(component_order):
            has_abstraction = 0 <= abstraction_index < len(abstractions)
            has_content = i < len(components_content)
            if not (has_abstraction and has_content):
                print(
                    f"Warning: Mismatch between component order, abstractions, or content at index {i} (abstraction index {abstraction_index}). Skipping file generation for this entry."
                )
                continue

            abstraction_name = abstractions[abstraction_index]["name"]
            # Replace every non-alphanumeric character so the name is safe
            # to use as a filename.
            safe_name = "".join(
                c if c.isalnum() else "_" for c in abstraction_name
            ).lower()
            filename = f"{i+1:02d}_{safe_name}.md"
            # Newlines inside the link text would break the Markdown link.
            clean_abstraction_name = abstraction_name.replace("\n", " ").strip()
            index_parts.append(f"{i+1}. [{clean_abstraction_name}]({filename})\n")

            component_files.append(
                {"filename": filename, "content": components_content[i]}
            )

        # Attribution footer; stripped again when the index is embedded in
        # the combined document.
        index_parts.append("\n\n---\n\nGenerated by [WIKIGEN](https://usesalt.co)")
        index_content = "".join(index_parts)

        combined_content = self._generate_combined_content(
            project_name, index_content, components_content
        )

        elapsed = time.time() - start_time
        print_success("Generated index and combined files", elapsed, indent=1)
        print_phase_end()

        return {
            "project_name": project_name,
            "output_path": output_path,
            "index_content": index_content,
            "component_files": component_files,
            "combined_content": combined_content,
        }

    def post(self, shared, prep_res, exec_res):
        """Store the assembled content dict under ``shared["doc_content"]``."""
        shared["doc_content"] = exec_res
1052
+
1053
+
1054
class WriteDocFiles(Node):
    """Write the combined documentation file into the output directory.

    Only ``combined_content`` is written, as ``<project_name>.md``; any other
    entries of the content dict are not touched by this node.
    """

    def prep(self, shared):
        """Return the assembled content dict stored in ``shared["doc_content"]``."""
        return shared["doc_content"]

    def exec(self, doc_content):
        """Create the output directory and write the combined Markdown file.

        Args:
            doc_content: Dict providing ``project_name``, ``output_path`` and
                ``combined_content``.

        Returns:
            The output directory path the file was written to.
        """
        started_at = time.time()
        project_name = doc_content["project_name"]
        output_path = doc_content["output_path"]
        combined_content = doc_content["combined_content"]

        print_phase_start("Writing Output Files", Icons.CREATING)

        # No local error handling: per the original author's note, failures
        # are left to Node's built-in retry/fallback.
        os.makedirs(output_path, exist_ok=True)

        combined_filename = f"{project_name}.md"
        target = os.path.join(output_path, combined_filename)
        with open(target, "w", encoding="utf-8") as handle:
            handle.write(combined_content)
        print_operation(f"{Icons.SUCCESS} {combined_filename}", indent=1)

        print_success(
            "Documentation file written", time.time() - started_at, indent=1
        )

        return output_path

    def post(self, shared, prep_res, exec_res):
        """Record the output directory under ``shared["final_output_dir"]``."""
        shared["final_output_dir"] = exec_res