ifcraftcorpus 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ifcraftcorpus/cli.py +54 -5
- ifcraftcorpus/embeddings.py +11 -7
- ifcraftcorpus/index.py +26 -4
- ifcraftcorpus/logging_utils.py +84 -0
- ifcraftcorpus/mcp_server.py +418 -22
- ifcraftcorpus/providers.py +4 -4
- ifcraftcorpus/search.py +60 -12
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/agent-design/agent_prompt_engineering.md +183 -9
- ifcraftcorpus-1.2.1.data/data/share/ifcraftcorpus/subagents/README.md +198 -0
- ifcraftcorpus-1.2.1.data/data/share/ifcraftcorpus/subagents/if_genre_consultant.md +257 -0
- ifcraftcorpus-1.2.1.data/data/share/ifcraftcorpus/subagents/if_platform_advisor.md +306 -0
- ifcraftcorpus-1.2.1.data/data/share/ifcraftcorpus/subagents/if_prose_writer.md +187 -0
- ifcraftcorpus-1.2.1.data/data/share/ifcraftcorpus/subagents/if_quality_reviewer.md +245 -0
- ifcraftcorpus-1.2.1.data/data/share/ifcraftcorpus/subagents/if_story_architect.md +162 -0
- ifcraftcorpus-1.2.1.data/data/share/ifcraftcorpus/subagents/if_world_curator.md +280 -0
- {ifcraftcorpus-1.1.0.dist-info → ifcraftcorpus-1.2.1.dist-info}/METADATA +18 -1
- ifcraftcorpus-1.2.1.dist-info/RECORD +67 -0
- ifcraftcorpus-1.1.0.dist-info/RECORD +0 -59
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/agent-design/multi_agent_patterns.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/audience-and-access/accessibility_guidelines.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/audience-and-access/audience_targeting.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/audience-and-access/localization_considerations.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/craft-foundations/audio_visual_integration.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/craft-foundations/collaborative_if_writing.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/craft-foundations/creative_workflow_pipeline.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/craft-foundations/diegetic_design.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/craft-foundations/idea_capture_and_hooks.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/craft-foundations/if_platform_tools.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/craft-foundations/player_analytics_metrics.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/craft-foundations/quality_standards_if.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/craft-foundations/research_and_verification.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/craft-foundations/testing_interactive_fiction.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/emotional-design/conflict_patterns.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/emotional-design/emotional_beats.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/game-design/mechanics_design_patterns.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/genre-conventions/children_and_ya_conventions.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/genre-conventions/fantasy_conventions.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/genre-conventions/historical_fiction.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/genre-conventions/horror_conventions.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/genre-conventions/mystery_conventions.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/genre-conventions/sci_fi_conventions.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/narrative-structure/branching_narrative_construction.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/narrative-structure/branching_narrative_craft.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/narrative-structure/endings_patterns.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/narrative-structure/episodic_serialized_if.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/narrative-structure/nonlinear_structure.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/narrative-structure/pacing_and_tension.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/narrative-structure/romance_and_relationships.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/narrative-structure/scene_structure_and_beats.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/narrative-structure/scene_transitions.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/prose-and-language/character_voice.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/prose-and-language/dialogue_craft.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/prose-and-language/exposition_techniques.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/prose-and-language/narrative_point_of_view.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/prose-and-language/prose_patterns.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/prose-and-language/subtext_and_implication.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/prose-and-language/voice_register_consistency.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/scope-and-planning/scope_and_length.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/world-and-setting/canon_management.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/world-and-setting/setting_as_character.md +0 -0
- {ifcraftcorpus-1.1.0.data → ifcraftcorpus-1.2.1.data}/data/share/ifcraftcorpus/corpus/world-and-setting/worldbuilding_patterns.md +0 -0
- {ifcraftcorpus-1.1.0.dist-info → ifcraftcorpus-1.2.1.dist-info}/WHEEL +0 -0
- {ifcraftcorpus-1.1.0.dist-info → ifcraftcorpus-1.2.1.dist-info}/entry_points.txt +0 -0
- {ifcraftcorpus-1.1.0.dist-info → ifcraftcorpus-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {ifcraftcorpus-1.1.0.dist-info → ifcraftcorpus-1.2.1.dist-info}/licenses/LICENSE-CONTENT +0 -0
ifcraftcorpus/cli.py
CHANGED
|
@@ -17,8 +17,26 @@ from __future__ import annotations
|
|
|
17
17
|
|
|
18
18
|
import argparse
|
|
19
19
|
import json
|
|
20
|
+
import logging
|
|
20
21
|
import sys
|
|
21
22
|
from pathlib import Path
|
|
23
|
+
from typing import TYPE_CHECKING
|
|
24
|
+
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
from ifcraftcorpus.providers import EmbeddingProvider
|
|
27
|
+
|
|
28
|
+
from ifcraftcorpus.logging_utils import configure_logging
|
|
29
|
+
|
|
30
|
+
configure_logging()
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _truncate(value: str, limit: int = 120) -> str:
|
|
35
|
+
"""Shorten long log values to keep CLI logs readable."""
|
|
36
|
+
|
|
37
|
+
if len(value) <= limit:
|
|
38
|
+
return value
|
|
39
|
+
return f"{value[:limit]}..."
|
|
22
40
|
|
|
23
41
|
|
|
24
42
|
def cmd_info(args: argparse.Namespace) -> int:
|
|
@@ -26,12 +44,19 @@ def cmd_info(args: argparse.Namespace) -> int:
|
|
|
26
44
|
from ifcraftcorpus import Corpus, __version__
|
|
27
45
|
|
|
28
46
|
corpus = Corpus()
|
|
47
|
+
clusters = corpus.list_clusters()
|
|
48
|
+
logger.info(
|
|
49
|
+
"CLI info command: version=%s docs=%s clusters=%s",
|
|
50
|
+
__version__,
|
|
51
|
+
corpus.document_count(),
|
|
52
|
+
len(clusters),
|
|
53
|
+
)
|
|
29
54
|
|
|
30
55
|
print(f"\nIF Craft Corpus v{__version__}")
|
|
31
56
|
print(f"Documents: {corpus.document_count()}")
|
|
32
|
-
print(f"Clusters: {len(
|
|
57
|
+
print(f"Clusters: {len(clusters)}")
|
|
33
58
|
print("\nClusters:")
|
|
34
|
-
for cluster in
|
|
59
|
+
for cluster in clusters:
|
|
35
60
|
docs = [d for d in corpus.list_documents() if d["cluster"] == cluster]
|
|
36
61
|
print(f" {cluster}: {len(docs)} file(s)")
|
|
37
62
|
|
|
@@ -43,6 +68,12 @@ def cmd_search(args: argparse.Namespace) -> int:
|
|
|
43
68
|
from ifcraftcorpus import Corpus
|
|
44
69
|
|
|
45
70
|
corpus = Corpus()
|
|
71
|
+
logger.info(
|
|
72
|
+
"CLI search query=%r cluster=%s limit=%s",
|
|
73
|
+
_truncate(args.query),
|
|
74
|
+
args.cluster,
|
|
75
|
+
args.limit,
|
|
76
|
+
)
|
|
46
77
|
results = corpus.search(
|
|
47
78
|
args.query,
|
|
48
79
|
limit=args.limit,
|
|
@@ -51,6 +82,7 @@ def cmd_search(args: argparse.Namespace) -> int:
|
|
|
51
82
|
)
|
|
52
83
|
|
|
53
84
|
if not results:
|
|
85
|
+
logger.info("CLI search returned no matches")
|
|
54
86
|
print("No results found.")
|
|
55
87
|
return 0
|
|
56
88
|
|
|
@@ -65,6 +97,7 @@ def cmd_search(args: argparse.Namespace) -> int:
|
|
|
65
97
|
content += "..."
|
|
66
98
|
print(f" {content}")
|
|
67
99
|
|
|
100
|
+
logger.info("CLI search returned %s results", len(results))
|
|
68
101
|
return 0
|
|
69
102
|
|
|
70
103
|
|
|
@@ -77,6 +110,7 @@ def cmd_embeddings_status(args: argparse.Namespace) -> int:
|
|
|
77
110
|
get_embedding_provider,
|
|
78
111
|
)
|
|
79
112
|
|
|
113
|
+
logger.debug("CLI embeddings status requested")
|
|
80
114
|
print("\n=== Embedding Providers ===\n")
|
|
81
115
|
|
|
82
116
|
# Check each provider
|
|
@@ -129,7 +163,7 @@ def cmd_embeddings_build(args: argparse.Namespace) -> int:
|
|
|
129
163
|
)
|
|
130
164
|
|
|
131
165
|
# Get provider
|
|
132
|
-
provider = None
|
|
166
|
+
provider: EmbeddingProvider | None = None
|
|
133
167
|
if args.provider:
|
|
134
168
|
if args.provider == "ollama":
|
|
135
169
|
provider = OllamaEmbeddings(model=args.model, host=args.ollama_host)
|
|
@@ -152,12 +186,19 @@ def cmd_embeddings_build(args: argparse.Namespace) -> int:
|
|
|
152
186
|
print(f"Provider {provider.provider_name} is not available.", file=sys.stderr)
|
|
153
187
|
return 1
|
|
154
188
|
|
|
189
|
+
logger.info(
|
|
190
|
+
"CLI embeddings build provider=%s model=%s output=%s",
|
|
191
|
+
provider.provider_name,
|
|
192
|
+
provider.model,
|
|
193
|
+
args.output,
|
|
194
|
+
)
|
|
155
195
|
print(f"Using provider: {provider.provider_name}")
|
|
156
196
|
print(f"Model: {provider.model} ({provider.dimension}d)")
|
|
157
197
|
|
|
158
198
|
# Build embeddings
|
|
159
199
|
corpus = Corpus()
|
|
160
|
-
|
|
200
|
+
doc_total = corpus.document_count()
|
|
201
|
+
print(f"\nBuilding embeddings for {doc_total} documents...")
|
|
161
202
|
|
|
162
203
|
# Use the corpus's internal index
|
|
163
204
|
embedding_index = EmbeddingIndex(provider=provider)
|
|
@@ -214,6 +255,12 @@ def cmd_embeddings_build(args: argparse.Namespace) -> int:
|
|
|
214
255
|
output_path = Path(args.output)
|
|
215
256
|
embedding_index.save(output_path)
|
|
216
257
|
|
|
258
|
+
logger.info(
|
|
259
|
+
"CLI embeddings build completed docs=%s sections=%s output=%s",
|
|
260
|
+
doc_count,
|
|
261
|
+
section_count,
|
|
262
|
+
output_path,
|
|
263
|
+
)
|
|
217
264
|
print(f"\nDone! Embedded {section_count} sections from {doc_count} documents.")
|
|
218
265
|
print(f"Saved to: {output_path}")
|
|
219
266
|
|
|
@@ -273,7 +320,9 @@ def main() -> int:
|
|
|
273
320
|
emb_parser.print_help()
|
|
274
321
|
return 0
|
|
275
322
|
|
|
276
|
-
|
|
323
|
+
logger.debug("CLI command executed: %s", args.command)
|
|
324
|
+
result: int = args.func(args)
|
|
325
|
+
return result
|
|
277
326
|
|
|
278
327
|
|
|
279
328
|
if __name__ == "__main__":
|
ifcraftcorpus/embeddings.py
CHANGED
|
@@ -44,10 +44,13 @@ from __future__ import annotations
|
|
|
44
44
|
import json
|
|
45
45
|
import logging
|
|
46
46
|
from pathlib import Path
|
|
47
|
-
from typing import TYPE_CHECKING
|
|
47
|
+
from typing import TYPE_CHECKING, Any
|
|
48
48
|
|
|
49
49
|
import numpy as np
|
|
50
50
|
|
|
51
|
+
if TYPE_CHECKING:
|
|
52
|
+
from sentence_transformers import SentenceTransformer
|
|
53
|
+
|
|
51
54
|
if TYPE_CHECKING:
|
|
52
55
|
from ifcraftcorpus.index import CorpusIndex
|
|
53
56
|
from ifcraftcorpus.providers import EmbeddingProvider
|
|
@@ -107,7 +110,8 @@ class EmbeddingIndex:
|
|
|
107
110
|
"""
|
|
108
111
|
self._provider = provider
|
|
109
112
|
self._embeddings: np.ndarray | None = None
|
|
110
|
-
self._metadata: list[dict] = []
|
|
113
|
+
self._metadata: list[dict[str, Any]] = []
|
|
114
|
+
self._st_model: SentenceTransformer | None = None
|
|
111
115
|
|
|
112
116
|
# For backward compatibility / persistence
|
|
113
117
|
if provider:
|
|
@@ -117,7 +121,6 @@ class EmbeddingIndex:
|
|
|
117
121
|
self.model_name = model_name
|
|
118
122
|
self._provider_name = "sentence-transformers"
|
|
119
123
|
# Lazy-load sentence-transformers model
|
|
120
|
-
self._st_model = None
|
|
121
124
|
if not lazy_load:
|
|
122
125
|
self._load_st_model()
|
|
123
126
|
|
|
@@ -126,7 +129,7 @@ class EmbeddingIndex:
|
|
|
126
129
|
"""Get the provider name."""
|
|
127
130
|
return self._provider_name
|
|
128
131
|
|
|
129
|
-
def _load_st_model(self):
|
|
132
|
+
def _load_st_model(self) -> SentenceTransformer:
|
|
130
133
|
"""Load sentence-transformers model (fallback)."""
|
|
131
134
|
if self._st_model is None:
|
|
132
135
|
try:
|
|
@@ -148,12 +151,13 @@ class EmbeddingIndex:
|
|
|
148
151
|
else:
|
|
149
152
|
# Fallback to sentence-transformers
|
|
150
153
|
model = self._load_st_model()
|
|
151
|
-
|
|
154
|
+
embeddings = model.encode(texts, show_progress_bar=False, convert_to_numpy=True)
|
|
155
|
+
return np.asarray(embeddings)
|
|
152
156
|
|
|
153
157
|
def add_texts(
|
|
154
158
|
self,
|
|
155
159
|
texts: list[str],
|
|
156
|
-
metadata: list[dict],
|
|
160
|
+
metadata: list[dict[str, Any]],
|
|
157
161
|
) -> None:
|
|
158
162
|
"""Add texts with metadata to the index.
|
|
159
163
|
|
|
@@ -185,7 +189,7 @@ class EmbeddingIndex:
|
|
|
185
189
|
*,
|
|
186
190
|
top_k: int = 10,
|
|
187
191
|
cluster: str | None = None,
|
|
188
|
-
) -> list[tuple[dict, float]]:
|
|
192
|
+
) -> list[tuple[dict[str, Any], float]]:
|
|
189
193
|
"""Search for semantically similar texts.
|
|
190
194
|
|
|
191
195
|
Args:
|
ifcraftcorpus/index.py
CHANGED
|
@@ -48,10 +48,31 @@ from __future__ import annotations
|
|
|
48
48
|
import sqlite3
|
|
49
49
|
from dataclasses import dataclass
|
|
50
50
|
from pathlib import Path
|
|
51
|
+
from typing import Any
|
|
51
52
|
|
|
52
53
|
from ifcraftcorpus.parser import Document, parse_directory
|
|
53
54
|
|
|
54
55
|
|
|
56
|
+
def _sanitize_fts_query(query: str) -> str:
|
|
57
|
+
"""Sanitize a query string for the FTS5 MATCH clause.
|
|
58
|
+
|
|
59
|
+
This function replaces hyphens with spaces to prevent FTS5 from
|
|
60
|
+
interpreting them as the `NOT` operator. This is intended to correctly
|
|
61
|
+
handle natural language queries with hyphenated words, for example
|
|
62
|
+
transforming "haunted-house" into a search for "haunted house".
|
|
63
|
+
|
|
64
|
+
It also collapses any resulting multiple spaces into a single space.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
query: Raw query string from user input.
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
Sanitized query safe for FTS5 MATCH.
|
|
71
|
+
"""
|
|
72
|
+
# Replace hyphens and collapse whitespace in one go.
|
|
73
|
+
return " ".join(query.replace("-", " ").split())
|
|
74
|
+
|
|
75
|
+
|
|
55
76
|
@dataclass
|
|
56
77
|
class SearchResult:
|
|
57
78
|
"""A search result from the corpus FTS5 index.
|
|
@@ -379,8 +400,8 @@ class CorpusIndex:
|
|
|
379
400
|
... cluster="emotional-design",
|
|
380
401
|
... limit=5)
|
|
381
402
|
"""
|
|
382
|
-
# Build FTS5 query
|
|
383
|
-
fts_query = query
|
|
403
|
+
# Build FTS5 query - sanitize to handle special characters
|
|
404
|
+
fts_query = _sanitize_fts_query(query)
|
|
384
405
|
|
|
385
406
|
# Add cluster filter if specified
|
|
386
407
|
where_clause = ""
|
|
@@ -462,7 +483,7 @@ class CorpusIndex:
|
|
|
462
483
|
cursor = self.conn.execute("SELECT DISTINCT cluster FROM documents ORDER BY cluster")
|
|
463
484
|
return [row["cluster"] for row in cursor]
|
|
464
485
|
|
|
465
|
-
def get_document(self, name: str) -> dict | None:
|
|
486
|
+
def get_document(self, name: str) -> dict[str, Any] | None:
|
|
466
487
|
"""Get a document by name with all its sections.
|
|
467
488
|
|
|
468
489
|
Retrieves complete document data including metadata and all
|
|
@@ -535,7 +556,8 @@ class CorpusIndex:
|
|
|
535
556
|
Count of documents in the index.
|
|
536
557
|
"""
|
|
537
558
|
cursor = self.conn.execute("SELECT COUNT(*) FROM documents")
|
|
538
|
-
|
|
559
|
+
result = cursor.fetchone()
|
|
560
|
+
return int(result[0]) if result else 0
|
|
539
561
|
|
|
540
562
|
|
|
541
563
|
def build_index(corpus_dir: Path, output_path: Path) -> CorpusIndex:
|
|
ifcraftcorpus/logging_utils.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Shared logging helpers for the IF Craft Corpus codebase."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
from typing import Final
|
|
9
|
+
|
|
10
|
+
# Default names of the environment variables read by ``configure_logging``.
LOG_LEVEL_ENV: Final[str] = "LOG_LEVEL"
VERBOSE_ENV: Final[str] = "VERBOSE"

__all__ = ["configure_logging", "LOG_LEVEL_ENV", "VERBOSE_ENV"]

# Spellings accepted as "enabled" by ``_is_truthy``; stored lower-case because
# the candidate is lower-cased before the membership test.
_TRUTHY_VALUES: Final[set[str]] = {"1", "true", "yes", "on"}
# Flipped to True by ``configure_logging`` once it has called
# ``logging.basicConfig``.
_configured: bool = False
# Loggers that ``configure_logging`` never lets drop below WARNING.
_CHATTY_LOGGERS: Final[tuple[str, ...]] = (
    "httpx",
    "fakeredis",
    "docket",
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _is_truthy(value: str | None) -> bool:
    """Return True if the string resembles a truthy flag.

    Args:
        value: Raw flag text, or ``None`` when the flag is unset.

    Returns:
        ``True`` when ``value``, stripped of surrounding whitespace and
        lower-cased, is a member of ``_TRUTHY_VALUES``; ``False`` for
        ``None`` and for every other string (including the empty string).
    """
    if value is None:
        return False
    return value.strip().lower() in _TRUTHY_VALUES
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _resolve_level(value: str | None) -> int | None:
|
|
33
|
+
"""Convert a logging level string (name or integer) to ``int``."""
|
|
34
|
+
|
|
35
|
+
if not value:
|
|
36
|
+
return None
|
|
37
|
+
candidate = value.strip()
|
|
38
|
+
if not candidate:
|
|
39
|
+
return None
|
|
40
|
+
if candidate.isdigit():
|
|
41
|
+
return int(candidate)
|
|
42
|
+
name = candidate.upper()
|
|
43
|
+
return getattr(logging, name, None)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def configure_logging(
    *,
    env_level: str = LOG_LEVEL_ENV,
    env_verbose: str = VERBOSE_ENV,
    fmt: str = "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
) -> int | None:
    """Configure root logging when LOG_LEVEL/VERBOSE are set.

    The level is taken from the ``env_level`` environment variable. If that
    variable is set but cannot be resolved, a warning is printed to stderr
    and INFO is used. If it is unset and the ``env_verbose`` variable is
    truthy, DEBUG is used. If neither applies, logging is left untouched.

    Args:
        env_level: Name of the environment variable holding the log level.
        env_verbose: Name of the environment variable holding the verbose
            flag.
        fmt: Format string passed to ``logging.basicConfig``.

    Returns:
        The configured level when logging is enabled, ``None`` otherwise.
    """
    global _configured

    raw_level = os.getenv(env_level)
    level = _resolve_level(raw_level)
    verbose_flag = os.getenv(env_verbose)

    # A value was supplied but not understood: tell the user and carry on
    # with a sensible default rather than silently disabling logging.
    if raw_level and level is None:
        print(
            f"ifcraftcorpus: unknown log level '{raw_level}', defaulting to INFO",
            file=sys.stderr,
        )
        level = logging.INFO

    # Neither an explicit level nor the verbose flag: do not touch logging.
    if level is None and not _is_truthy(verbose_flag):
        return None

    # Verbose flag without an explicit level means full debug output.
    if level is None:
        level = logging.DEBUG

    root = logging.getLogger()
    # Skip basicConfig only when this function already ran it and the root
    # logger still has handlers.
    if not (root.handlers and _configured):
        logging.basicConfig(level=level, format=fmt, stream=sys.stderr)
        _configured = True
    # Always apply the level, including on repeat calls.
    root.setLevel(level)

    # Keep noisy third-party loggers at WARNING or above regardless of the
    # requested level.
    for name in _CHATTY_LOGGERS:
        logging.getLogger(name).setLevel(max(logging.WARNING, level))
    return level
|