ppi-analyser 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. ppi_analyser-0.1.0/MANIFEST.in +4 -0
  2. ppi_analyser-0.1.0/PKG-INFO +100 -0
  3. ppi_analyser-0.1.0/README.md +49 -0
  4. ppi_analyser-0.1.0/ppi_analyser/__init__.py +0 -0
  5. ppi_analyser-0.1.0/ppi_analyser/analysis/__init__.py +0 -0
  6. ppi_analyser-0.1.0/ppi_analyser/analysis/analysis_cache.py +74 -0
  7. ppi_analyser-0.1.0/ppi_analyser/analysis/expansion.py +140 -0
  8. ppi_analyser-0.1.0/ppi_analyser/analysis/mistral_batch_pipeline.py +288 -0
  9. ppi_analyser-0.1.0/ppi_analyser/analysis/modifier_rules.yaml +27 -0
  10. ppi_analyser-0.1.0/ppi_analyser/analysis/modifiers.py +132 -0
  11. ppi_analyser-0.1.0/ppi_analyser/analysis/pipeline.py +422 -0
  12. ppi_analyser-0.1.0/ppi_analyser/analysis/position.py +96 -0
  13. ppi_analyser-0.1.0/ppi_analyser/analysis/prompts.py +206 -0
  14. ppi_analyser-0.1.0/ppi_analyser/analysis/results.py +121 -0
  15. ppi_analyser-0.1.0/ppi_analyser/analysis/sentence.py +443 -0
  16. ppi_analyser-0.1.0/ppi_analyser/analysis/system_prompts.txt +652 -0
  17. ppi_analyser-0.1.0/ppi_analyser/config.py +81 -0
  18. ppi_analyser-0.1.0/ppi_analyser/core.py +107 -0
  19. ppi_analyser-0.1.0/ppi_analyser/example_usage.py +25 -0
  20. ppi_analyser-0.1.0/ppi_analyser/exceptions.py +29 -0
  21. ppi_analyser-0.1.0/ppi_analyser/exporters/__init__.py +0 -0
  22. ppi_analyser-0.1.0/ppi_analyser/exporters/excel.py +582 -0
  23. ppi_analyser-0.1.0/ppi_analyser/exporters/pdf.py +442 -0
  24. ppi_analyser-0.1.0/ppi_analyser/logger.py +27 -0
  25. ppi_analyser-0.1.0/ppi_analyser/models/__init__.py +0 -0
  26. ppi_analyser-0.1.0/ppi_analyser/models/base.py +10 -0
  27. ppi_analyser-0.1.0/ppi_analyser/models/deepseek.py +19 -0
  28. ppi_analyser-0.1.0/ppi_analyser/models/dummy.py +7 -0
  29. ppi_analyser-0.1.0/ppi_analyser/models/factory.py +44 -0
  30. ppi_analyser-0.1.0/ppi_analyser/models/gemini.py +13 -0
  31. ppi_analyser-0.1.0/ppi_analyser/models/groq.py +21 -0
  32. ppi_analyser-0.1.0/ppi_analyser/models/mistral.py +199 -0
  33. ppi_analyser-0.1.0/ppi_analyser/models/no_model.py +92 -0
  34. ppi_analyser-0.1.0/ppi_analyser/models/ollama.py +23 -0
  35. ppi_analyser-0.1.0/ppi_analyser/preprocessing/__init__.py +0 -0
  36. ppi_analyser-0.1.0/ppi_analyser/preprocessing/conversation.py +130 -0
  37. ppi_analyser-0.1.0/ppi_analyser/preprocessing/detect_narration.py +158 -0
  38. ppi_analyser-0.1.0/ppi_analyser/preprocessing/segmentation.py +284 -0
  39. ppi_analyser-0.1.0/ppi_analyser/preprocessing/segmentation_cache.py +65 -0
  40. ppi_analyser-0.1.0/ppi_analyser/preprocessing/speakers.py +76 -0
  41. ppi_analyser-0.1.0/ppi_analyser/prompts_batch.txt +957 -0
  42. ppi_analyser-0.1.0/ppi_analyser/server.py +626 -0
  43. ppi_analyser-0.1.0/ppi_analyser/stanza/gunicorn_config.py +6 -0
  44. ppi_analyser-0.1.0/ppi_analyser/stanza/stanza_api.py +123 -0
  45. ppi_analyser-0.1.0/ppi_analyser/stanza/stanza_api_proxy.py +93 -0
  46. ppi_analyser-0.1.0/ppi_analyser/stanza/stanza_client.py +56 -0
  47. ppi_analyser-0.1.0/ppi_analyser/stanza/wsgi.py +5 -0
  48. ppi_analyser-0.1.0/ppi_analyser/state.py +38 -0
  49. ppi_analyser-0.1.0/ppi_analyser/test.py +189 -0
  50. ppi_analyser-0.1.0/ppi_analyser.egg-info/PKG-INFO +100 -0
  51. ppi_analyser-0.1.0/ppi_analyser.egg-info/SOURCES.txt +57 -0
  52. ppi_analyser-0.1.0/ppi_analyser.egg-info/dependency_links.txt +1 -0
  53. ppi_analyser-0.1.0/ppi_analyser.egg-info/entry_points.txt +2 -0
  54. ppi_analyser-0.1.0/ppi_analyser.egg-info/requires.txt +30 -0
  55. ppi_analyser-0.1.0/ppi_analyser.egg-info/top_level.txt +4 -0
  56. ppi_analyser-0.1.0/pyproject.toml +79 -0
  57. ppi_analyser-0.1.0/setup.cfg +4 -0
  58. ppi_analyser-0.1.0/windows_installer/docker-compose.yml +22 -0
  59. ppi_analyser-0.1.0/windows_installer/setup_ppi.py +475 -0
@@ -0,0 +1,4 @@
1
+ exclude .env
2
+ exclude .env.*
3
+ recursive-exclude * .env
4
+ recursive-exclude * .env.*
@@ -0,0 +1,100 @@
1
+ Metadata-Version: 2.4
2
+ Name: ppi-analyser
3
+ Version: 0.1.0
4
+ Summary: Analyse automatique de phrases préfabriquées d'interaction (PPI) — pipeline NLP + serveur FastAPI
5
+ Author: Youssef Zeroual
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/YoussefZeroual/ppi-analyser
8
+ Project-URL: Repository, https://github.com/YoussefZeroual/ppi-analyser
9
+ Project-URL: Issues, https://github.com/YoussefZeroual/ppi-analyser/issues
10
+ Keywords: nlp,phraseology,french,corpus-linguistics,fastapi,stanza
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Classifier: Topic :: Text Processing :: Linguistic
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: License :: OSI Approved :: MIT License
19
+ Classifier: Operating System :: OS Independent
20
+ Requires-Python: >=3.11
21
+ Description-Content-Type: text/markdown
22
+ Requires-Dist: fastapi>=0.111.0
23
+ Requires-Dist: uvicorn[standard]>=0.29.0
24
+ Requires-Dist: python-multipart>=0.0.9
25
+ Requires-Dist: flask>=3.0.0
26
+ Requires-Dist: flask-cors>=4.0.0
27
+ Requires-Dist: torch>=2.3.1
28
+ Requires-Dist: stanza>=1.8.0
29
+ Requires-Dist: nltk>=3.8.1
30
+ Requires-Dist: mistralai>=1.0.0
31
+ Requires-Dist: openai>=1.30.0
32
+ Requires-Dist: ollama>=0.2.0
33
+ Requires-Dist: google-generativeai>=0.7.0
34
+ Requires-Dist: groq>=0.9.0
35
+ Requires-Dist: pandas>=2.2.0
36
+ Requires-Dist: openpyxl>=3.1.2
37
+ Requires-Dist: xlsxwriter>=3.2.0
38
+ Requires-Dist: reportlab>=4.1.0
39
+ Requires-Dist: markdown
40
+ Requires-Dist: weasyprint
41
+ Requires-Dist: python-dotenv>=1.0.0
42
+ Requires-Dist: pyyaml>=6.0.1
43
+ Requires-Dist: requests>=2.31.0
44
+ Requires-Dist: httpx>=0.27.0
45
+ Provides-Extra: dev
46
+ Requires-Dist: pytest; extra == "dev"
47
+ Requires-Dist: mypy; extra == "dev"
48
+ Requires-Dist: ruff; extra == "dev"
49
+ Requires-Dist: build; extra == "dev"
50
+ Requires-Dist: twine; extra == "dev"
51
+
52
+ # PPI Analyser
53
+
54
+ Outil d'analyse automatique de phrases préfabriquées d'interaction (PPI) françaises.
55
+
56
+ ## Installation
57
+ ```bash
58
+ python -m venv myenv
59
+ source myenv/bin/activate
60
+ pip install -e .
61
+ ```
62
+
63
+ ## Configuration
64
+
65
+ Copy `.env.example` to `.env` and fill in your API keys:
66
+ ```bash
67
+ cp .env.example .env
68
+ ```
69
+
70
+ ## Usage
71
+ ```python
72
+ from ppi_analyser.core import PPIAnalyser
73
+ from ppi_analyser.config import PipelineConfig, AnalysisMode
74
+
75
+ analyser = PPIAnalyser()
76
+ config = PipelineConfig(
77
+ models=["mistral_mistral-medium-latest"],
78
+ expression="je t'en prie",
79
+ sentence_file="path/to/corpus.xlsx",
80
+ mode=AnalysisMode.ORAL,
81
+ output_dir="path/to/output",
82
+ )
83
+ df, state = analyser.process_sentences(config)
84
+ ```
85
+
86
+ ## Modes
87
+
88
+ - `ORAL` — corpus oral
89
+ - `ECRIT` — corpus écrit (segmentation automatique)
90
+ - `ECRIT_IA` — corpus écrit avec détection des tours de parole par LLM
91
+ - `ECRIT_TEST` — mode test rapide
92
+
93
+ ## Models supported
94
+
95
+ - Mistral (`mistral_mistral-medium-latest`)
96
+ - Ollama (`ollama_mistral:7b`)
97
+ - Groq (`groq_moonshotai/kimi-k2-instruct`)
98
+ - DeepSeek (`deepseek_deepseek-chat`)
99
+ - Gemini (`gemini_gemini-3-flash-preview`)
100
+ # ppi_analyser
@@ -0,0 +1,49 @@
1
+ # PPI Analyser
2
+
3
+ Outil d'analyse automatique de phrases préfabriquées d'interaction (PPI) françaises.
4
+
5
+ ## Installation
6
+ ```bash
7
+ python -m venv myenv
8
+ source myenv/bin/activate
9
+ pip install -e .
10
+ ```
11
+
12
+ ## Configuration
13
+
14
+ Copy `.env.example` to `.env` and fill in your API keys:
15
+ ```bash
16
+ cp .env.example .env
17
+ ```
18
+
19
+ ## Usage
20
+ ```python
21
+ from ppi_analyser.core import PPIAnalyser
22
+ from ppi_analyser.config import PipelineConfig, AnalysisMode
23
+
24
+ analyser = PPIAnalyser()
25
+ config = PipelineConfig(
26
+ models=["mistral_mistral-medium-latest"],
27
+ expression="je t'en prie",
28
+ sentence_file="path/to/corpus.xlsx",
29
+ mode=AnalysisMode.ORAL,
30
+ output_dir="path/to/output",
31
+ )
32
+ df, state = analyser.process_sentences(config)
33
+ ```
34
+
35
+ ## Modes
36
+
37
+ - `ORAL` — corpus oral
38
+ - `ECRIT` — corpus écrit (segmentation automatique)
39
+ - `ECRIT_IA` — corpus écrit avec détection des tours de parole par LLM
40
+ - `ECRIT_TEST` — mode test rapide
41
+
42
+ ## Supported models
43
+
44
+ - Mistral (`mistral_mistral-medium-latest`)
45
+ - Ollama (`ollama_mistral:7b`)
46
+ - Groq (`groq_moonshotai/kimi-k2-instruct`)
47
+ - DeepSeek (`deepseek_deepseek-chat`)
48
+ - Gemini (`gemini_gemini-3-flash-preview`)
49
+ # ppi_analyser
File without changes
File without changes
@@ -0,0 +1,74 @@
1
+ # analysis/analysis_cache.py
2
+
3
+ import json
4
+ import hashlib
5
+ import logging
6
+ from pathlib import Path
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ _cache: dict | None = None
11
+ _cache_path: Path | None = None
12
+
13
+
14
def init(cache_path: str) -> None:
    """Set the cache location and load any entries already stored there.

    Call once at the start of a run. The module-level cache is always left
    as a dict: a missing file, unreadable/invalid JSON, or JSON whose top
    level is not an object all result in an empty cache (with a warning for
    the latter two).

    Args:
        cache_path: Path of the JSON file backing the cache.
    """
    global _cache, _cache_path
    _cache_path = Path(cache_path)
    _cache = {}
    if not _cache_path.exists():
        return
    try:
        loaded = json.loads(_cache_path.read_text(encoding="utf-8"))
    except Exception as e:
        logger.warning("Could not load analysis cache: %s — starting fresh", e)
        return
    if not isinstance(loaded, dict):
        # A list/string/number would be accepted by json.loads but breaks the
        # dict operations performed by get()/set() later on.
        logger.warning(
            "Could not load analysis cache: %s — starting fresh",
            f"expected a JSON object, got {type(loaded).__name__}",
        )
        return
    _cache = loaded
    logger.info("Analysis cache loaded: %d entries from %s", len(_cache), _cache_path)
27
+
28
+
29
def _require_init() -> None:
    """Raise RuntimeError unless the module-level cache has been set up."""
    if _cache is not None:
        return
    raise RuntimeError(
        "Analysis cache not initialised — call analysis_cache.init(path) first, "
        "or set use_analysis_cache=True and analysis_cache_path in PipelineConfig."
    )
35
+
36
+
37
def _key(conversation: str, expression: str, model: str, submodel: str, prompt_type: str) -> str:
    """Return the MD5 hex digest identifying one cached analysis.

    The five fields are joined with ``|`` and hashed as UTF-8.
    """
    fields = (conversation, expression, model, submodel, prompt_type)
    joined = "|".join(str(field) for field in fields)
    digest = hashlib.md5(joined.encode("utf-8"))
    return digest.hexdigest()
40
+
41
+
42
def get(conversation: str, expression: str, model: str, submodel: str, prompt_type: str) -> str | None:
    """Look up a cached analysis result.

    Returns the stored value, or None when no entry exists for this
    combination of fields. Raises RuntimeError if the cache was never
    initialised.
    """
    _require_init()
    entry_key = _key(conversation, expression, model, submodel, prompt_type)
    cached = _cache.get(entry_key)
    if cached is None:
        return cached
    logger.debug("Analysis cache HIT — %s / %s / %s", prompt_type, model, expression[:40])
    return cached
48
+
49
+
50
def set(conversation: str, expression: str, model: str, submodel: str, prompt_type: str, result: str) -> None:
    """Store ``result`` for this combination of fields and persist the cache.

    Note: inside this module the name ``set`` refers to this function, not
    to the builtin type. Raises RuntimeError if the cache was never
    initialised.
    """
    _require_init()
    entry_key = _key(conversation, expression, model, submodel, prompt_type)
    _cache[entry_key] = result
    # Every write is flushed to disk immediately.
    _save()
54
+
55
+
56
def _save() -> None:
    """Persist the in-memory cache to ``_cache_path`` as pretty-printed JSON.

    The data is written to a sibling ``<name>.tmp`` file first and then
    moved over the real file, so an interruption in the middle of a write
    cannot leave a truncated cache behind. Saving is best effort: any
    failure is logged as a warning and not re-raised.
    """
    try:
        _cache_path.parent.mkdir(parents=True, exist_ok=True)
        serialised = json.dumps(_cache, ensure_ascii=False, indent=2)
        tmp_path = _cache_path.with_name(_cache_path.name + ".tmp")
        tmp_path.write_text(serialised, encoding="utf-8")
        # Path.replace overwrites the destination in a single rename step.
        tmp_path.replace(_cache_path)
    except Exception as e:
        logger.warning("Could not save analysis cache: %s", e)
62
+
63
+
64
def cache_size() -> int:
    """Return the number of entries currently held in the cache.

    Raises RuntimeError if the cache was never initialised.
    """
    _require_init()
    entry_count = len(_cache)
    return entry_count
67
+
68
+
69
def clear() -> None:
    """Drop every cached entry and persist the now-empty cache.

    Raises RuntimeError if the cache was never initialised.
    """
    global _cache
    _require_init()
    _cache = dict()
    _save()
    logger.info("Analysis cache cleared")
@@ -0,0 +1,140 @@
1
+ import sys
2
+ import re
3
+ import pandas as pd
4
+ from ppi_analyser.stanza.stanza_client import StanzaClient
5
+ from ppi_analyser.exporters.excel import format_ppi_bold
6
+
7
+ client = StanzaClient()
8
+
9
+
10
+ def extract_ppi_sentence(tagged_line):
11
+ match = re.search(r'<PPI>(.*?)</PPI>', tagged_line, re.IGNORECASE)
12
+ if not match:
13
+ return None, None
14
+ ppi_text = match.group(1).strip()
15
+ # Trouve la limite gauche : dernier séparateur avant la balise
16
+ #pre = tagged_line[:match.start()]
17
+ post = tagged_line[match.end():]
18
+ # Coupe à gauche sur / ou début de ligne
19
+ #left = re.split(r'/', pre)[-1]
20
+ # Coupe à droite sur /
21
+ right = re.split(r'/', post)[0]
22
+ clean_seg = re.sub(r'</?PPI>', '', right, flags=re.IGNORECASE).strip() # <-- removed left be cause exp is always in the right
23
+ return ppi_text, clean_seg
24
+
25
def get_ppi_ids(sentence, ppi_text):
    """Return the ids of the words whose joined surface form equals the PPI.

    Every contiguous window of ``sentence.words`` is tried, in order of
    start position then length. Comparison is lower-cased and whitespace
    around hyphens is collapsed on both sides.

    Returns:
        The set of ``id`` values of the first matching window, or an empty
        set when no window matches.
    """
    def normalise(text):
        return re.sub(r'\s*-\s*', '-', text.lower())

    tokens = sentence.words
    target = normalise(ppi_text).strip()
    total = len(tokens)
    for start in range(total):
        for stop in range(start + 1, total + 1):
            span = tokens[start:stop]
            candidate = normalise(" ".join(tok.text for tok in span))
            if candidate == target:
                return {tok.id for tok in span}
    return set()
35
+
36
+
37
def get_ppi_head(sentence, ppi_ids):
    """Return the first PPI word whose syntactic head lies outside the PPI.

    Args:
        sentence: Object exposing ``words``, each with ``id`` and ``head``.
        ppi_ids: Ids of the words that make up the PPI.

    Returns:
        The matching word, or None when there is none.
    """
    candidates = (
        word
        for word in sentence.words
        if word.id in ppi_ids and word.head not in ppi_ids
    )
    return next(candidates, None)
42
+
43
+
44
def get_subtree(head_word, words, exclude_ids=frozenset()):
    """Collect ``head_word`` and all its transitive dependants.

    Args:
        head_word: Root of the subtree; always included in the result.
        words: All words of the sentence, each exposing ``id`` and ``head``.
        exclude_ids: Ids that must not be added to the subtree. Dependants
            reachable only through an excluded word are not collected
            either. Defaults to an immutable empty set.

    Returns:
        The subtree's words sorted by ``id``.
    """
    # Index children by head once instead of rescanning every word until
    # nothing changes.
    children = {}
    for word in words:
        children.setdefault(word.head, []).append(word.id)

    subtree_ids = {head_word.id}
    frontier = [head_word.id]
    while frontier:
        current = frontier.pop()
        for child_id in children.get(current, ()):
            if child_id in subtree_ids or child_id in exclude_ids:
                continue
            subtree_ids.add(child_id)
            frontier.append(child_id)
    return sorted((w for w in words if w.id in subtree_ids), key=lambda w: w.id)
54
+
55
+
56
def get_expansion_from_sentence(sentence, ppi_text):
    """Find the first syntactic expansion attached to the PPI in a sentence.

    The PPI is located in ``sentence.words``; its head word (the PPI word
    whose own head is outside the PPI) is determined; then the head's direct
    dependants outside the PPI are examined in sentence order. The first
    dependant matching one of these rules wins:

    * ``xcomp`` + ``VERB``                                   -> "infinitive"
    * ``ccomp`` / ``csubj``                                  -> "completive_que"
    * ``nmod``/``obl``/``obl:arg``/``obj``/``advcl`` with
      ``NOUN``/``PRON``/``VERB``                             -> "nominal_prep"

    Returns:
        A one-element list ``[{"type": ..., "tokens": [...]}]`` where
        ``tokens`` is the dependant's subtree (PPI words excluded). When the
        PPI is not found, has no head, or no dependant matches, the single
        element is ``{"type": None, "tokens": []}``.
    """
    ppi_ids = get_ppi_ids(sentence, ppi_text)
    if not ppi_ids:
        return [{"type": None, "tokens": []}]
    ppi_head = get_ppi_head(sentence, ppi_ids)
    if not ppi_head:
        return [{"type": None, "tokens": []}]

    words = sentence.words
    for dep in words:
        if dep.head != ppi_head.id or dep.id in ppi_ids:
            continue
        deprel = dep.deprel
        upos = dep.upos
        if deprel == "xcomp" and upos == "VERB":
            expansion_type = "infinitive"
        elif deprel in ("ccomp", "csubj"):
            expansion_type = "completive_que"
        elif deprel in ("nmod", "obl", "obl:arg", "obj", "advcl") and upos in ("NOUN", "PRON", "VERB"):
            expansion_type = "nominal_prep"
        else:
            continue
        # Only the first expansion is reported, so stop at the first hit
        # rather than building subtrees that would be discarded.
        subtree = get_subtree(dep, words, exclude_ids=ppi_ids)
        return [{"type": expansion_type, "tokens": subtree}]
    return [{"type": None, "tokens": []}]
84
+
85
+
86
def detect_expansion(doc, ppi_text, occurrence=0):
    """Return the expansion of the ``occurrence``-th sentence that has one.

    Sentences of ``doc.sentences`` are scanned in order; sentences for which
    ``get_expansion_from_sentence`` yields neither a type nor tokens are
    skipped and do not count towards ``occurrence`` (0-based).

    Returns:
        The expansion list of the selected sentence, or
        ``[{"type": None, "tokens": []}]`` when there are not enough
        sentences with an expansion.
    """
    seen = 0
    for sentence in doc.sentences:
        candidate = get_expansion_from_sentence(sentence, ppi_text)
        first = candidate[0]
        if first["type"] is None and not first["tokens"]:
            continue
        if seen == occurrence:
            return candidate
        seen += 1
    return [{"type": None, "tokens": []}]
95
+
96
+
97
def process_file(input_path):
    """Run expansion detection over every PPI-tagged line of a corpus file.

    Reads ``input_path`` (CSV when it ends with ".csv", Excel otherwise),
    splits each row's "Conversation" cell into lines, and for each line
    containing a ``<PPI>`` tag records the tagged expression and its
    detected expansion(s). Results are printed as they are found and written
    to ``<input without extension>_expansion.xlsx`` via ``format_ppi_bold``.

    NOTE(review): ``detect_expansion`` iterates ``doc.sentences``, but this
    function hands it the plain string returned by ``extract_ppi_sentence``;
    presumably the segment should first be parsed (the module-level
    ``client`` is not used here) — confirm against the StanzaClient API.
    NOTE(review): tokens are read here with ``w["text"]`` while the other
    helpers in this module use attribute access (``w.text``) — confirm which
    one the parsed words support.
    """
    reader = pd.read_csv if input_path.endswith(".csv") else pd.read_excel
    df_in = reader(input_path)
    print(df_in.columns.tolist())
    print(df_in.head(2))

    rows = []
    for _, record in df_in.iterrows():
        conversation = str(record.get("Conversation", ""))
        for line in conversation.split("\n"):
            if re.search(r'<PPI>', line, re.IGNORECASE) is None:
                continue
            ppi_text, clean_seg = extract_ppi_sentence(line)
            if not ppi_text:
                continue

            expansions = detect_expansion(clean_seg, ppi_text)
            exp = expansions[0]
            if exp["tokens"]:
                expansion_text = " ".join(w["text"] for w in exp["tokens"])
            else:
                expansion_text = ""

            tour = line.strip()
            print(f"Tour : {tour}")
            print(f" Type : {exp['type']}")
            print(f" Expansion: {expansion_text}\n")

            # A second expansion is reported only when one was returned.
            has_second = len(expansions) > 1
            second_type = expansions[1]["type"] if has_second else ""
            if has_second and expansions[1]["tokens"]:
                second_text = " ".join(w["text"] for w in expansions[1]["tokens"])
            else:
                second_text = ""

            rows.append({
                "Tour": tour,
                "PPI": ppi_text,
                "Type_expansion_1": exp["type"],
                "Expansion_1": expansion_text,
                "Type_expansion_2": second_type,
                "Expansion_2": second_text,
            })

    df_out = pd.DataFrame(rows)
    base = re.sub(r'\.(xlsx|csv)$', '', input_path)
    output_path = f"{base}_expansion.xlsx"
    format_ppi_bold(df_out, output_path)
    print(f"Résultat enregistré : {output_path}")
134
+
135
+
136
if __name__ == "__main__":
    # Command-line entry point: expects the corpus file as the only argument.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python expansion.py <fichier.csv|xlsx>")
        sys.exit(1)
    process_file(cli_args[0])
@@ -0,0 +1,288 @@
1
+ # analysis/mistral_batch_pipeline.py
2
+
3
+ import json
4
+ import logging
5
+ from pathlib import Path
6
+
7
+ from ppi_analyser.analysis.pipeline import PreprocessedSentence, _chunk
8
+ from ppi_analyser.analysis.prompts import get_prompts_batch, get_prompt_type
9
+ from ppi_analyser.analysis.sentence import _handle_no_model_batch, _parse_batch_response
10
+ from ppi_analyser.models.factory import get_mistral_batch_provider
11
+ from ppi_analyser.config import PipelineConfig
12
+ from ppi_analyser.state import SessionState
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ #NON_IA = dict(state.no_ia)#{0, 1, 5, 7,8} # Forme, Lemme, Position, expansion — handled locally, not submitted to Mistral
17
+
18
+
19
def _custom_id(chunk_idx: int, prop_idx: int) -> str:
    """Build the ``c<chunk>_p<property>`` identifier used to key batch jobs."""
    return "c{}_p{}".format(chunk_idx, prop_idx)
21
+
22
+
23
+ # ---------------------------------------------------------------------------
24
+ # Main entry point
25
+ # ---------------------------------------------------------------------------
26
+
27
def analyse_batch_mistral_async(
    preprocessed: list[PreprocessedSentence],
    lemmes: list[str] | None,
    config: PipelineConfig,
    state: SessionState,
) -> tuple[list[PreprocessedSentence], list[list[str]]]:
    """Analyse sentences through Mistral batch jobs, one job per chunk × property.

    Properties whose index is in ``state.no_ia`` are not submitted, and
    neither are those filtered out by ``config.properties`` or
    ``state.custom_properties_list``. When the analysis cache is enabled,
    prompts already present in it are served from the cache instead of being
    submitted.

    If the provider reports saved job state from an interrupted run, no new
    job is submitted: the saved jobs are polled again. In that case the
    cache hits and the cache-key information are rebuilt from the prompts,
    because only the job map is persisted between runs; without this the
    properties served from the cache in the first run would come back empty
    and freshly completed jobs would never be cached.

    Returns:
        ``(preprocessed, results)`` where ``results[sentence][property]`` is
        what ``_poll_and_assemble`` produced.

    Raises:
        InterruptedError: Some jobs are still running; re-run to resume.
        ValueError: ``config.models`` has no ``mistral_batch_<submodel>`` entry.
    """
    non_ia = set(state.no_ia)

    submodel = _resolve_submodel(config.models)
    provider = get_mistral_batch_provider(submodel, config.output_dir)

    # Check for saved job state from a previous interrupted run
    saved = provider.load_job_state()
    if saved:
        logger.info("Resuming Mistral batch jobs from saved state")
        job_map = saved["job_map"]
        cached_results = {}
        prompt_map = {}
        if state.use_analysis_cache:
            from ppi_analyser.analysis.analysis_cache import get as acache_get
            requests = _iter_ai_requests(
                preprocessed, lemmes, config, state, non_ia, record_expressions=False
            )
            for cid, _, _, _, batched_prompt, prompt_type in requests:
                if cid in job_map:
                    # Lets _poll_and_assemble store the result once the job ends.
                    prompt_map[cid] = {"user": batched_prompt, "prompt_type": prompt_type}
                    continue
                # Not submitted in the first run: it can only have been a cache hit.
                cached = acache_get(batched_prompt, "", "mistral_batch", submodel, prompt_type)
                if cached is not None:
                    cached_results[cid] = cached
        all_results = _poll_and_assemble(job_map, provider, preprocessed, lemmes, config, state,
                                         cached_results=cached_results,
                                         submodel=submodel,
                                         prompt_map=prompt_map)
        if all_results is None:
            raise InterruptedError("Mistral batch jobs still running. Re-run to resume.")
        provider.clear_job_state()
        return preprocessed, all_results

    # Save preprocessed so we can reassemble on resume
    preprocessed_path = Path(config.output_dir) / "mistral_batch_preprocessed.json"
    _save_preprocessed(preprocessed, lemmes, preprocessed_path)

    job_map = {}         # "c{i}_p{j}" -> job_id
    cached_results = {}  # "c{i}_p{j}" -> raw_response (from analysis cache)
    prompt_map = {}      # "c{i}_p{j}" -> {user, prompt_type} for cache saving

    requests = _iter_ai_requests(
        preprocessed, lemmes, config, state, non_ia, record_expressions=True
    )
    for cid, chunk_idx, prop_idx, system_prompt, batched_prompt, prompt_type in requests:
        # Check analysis cache before submitting
        if state.use_analysis_cache:
            from ppi_analyser.analysis.analysis_cache import get as acache_get
            cached = acache_get(batched_prompt, "", "mistral_batch", submodel, prompt_type)
            if cached is not None:
                logger.debug("Analysis cache HIT for %s chunk %d prop %d", prompt_type, chunk_idx, prop_idx)
                cached_results[cid] = cached
                continue

        job_id = provider.submit([{
            "custom_id": cid,
            "system": system_prompt,
            "user": batched_prompt,
        }])
        job_map[cid] = job_id
        prompt_map[cid] = {"user": batched_prompt, "prompt_type": prompt_type}
        logger.info("Submitted job %s for chunk %d prop %d", job_id, chunk_idx, prop_idx)

    # Poll all jobs
    all_results = _poll_and_assemble(job_map, provider, preprocessed, lemmes, config, state,
                                     preprocessed_path=str(preprocessed_path),
                                     cached_results=cached_results,
                                     submodel=submodel,
                                     prompt_map=prompt_map)
    if all_results is None:
        raise InterruptedError("Mistral batch jobs still running. Re-run to resume.")

    provider.clear_job_state()
    return preprocessed, all_results


def _iter_ai_requests(preprocessed, lemmes, config, state, non_ia, record_expressions):
    """Yield one request tuple per (chunk, property) that needs the model.

    Each item is ``(custom_id, chunk_idx, prop_idx, system_prompt,
    batched_prompt, prompt_type)``. Properties in ``non_ia`` or rejected by
    the ``config.properties`` / ``state.custom_properties_list`` filters are
    skipped. When ``record_expressions`` is true, each chunk's lemmas are
    appended to ``state.expression_list`` as the chunk is reached.
    """
    chunks = _chunk(preprocessed, config.batch_size)
    lemme_chunks = _chunk(lemmes, config.batch_size) if lemmes else [None] * len(chunks)

    for chunk_idx, (chunk, lemme_chunk) in enumerate(zip(chunks, lemme_chunks)):
        # The first lemma of the chunk stands for the whole chunk.
        expression = lemme_chunk[0] if lemme_chunk else config.expression
        if lemme_chunk and record_expressions:
            state.expression_list.extend(lemme_chunk)

        system_prompts, batched_prompts = get_prompts_batch(
            expression=expression,
            forme_relevee_list=[s.forme_relevee for s in chunk],
            conv_list=[s.cleaned for s in chunk],
            locuteur_list=[s.locuteur for s in chunk],
            interlocuteur_list=[s.interlocuteurs for s in chunk],
            mode=config.mode,
        )

        for prop_idx, (system_prompt, batched_prompt) in enumerate(
            zip(system_prompts, batched_prompts)
        ):
            if prop_idx in non_ia:
                continue
            prompt_type = get_prompt_type(system_prompt)
            if config.properties and prompt_type not in config.properties:
                continue
            if state.custom_properties_list is not None and prompt_type not in state.custom_properties_list:
                continue
            yield (
                _custom_id(chunk_idx, prop_idx),
                chunk_idx,
                prop_idx,
                system_prompt,
                batched_prompt,
                prompt_type,
            )
116
+
117
+
118
+ # ---------------------------------------------------------------------------
119
+ # Poll + assemble
120
+ # ---------------------------------------------------------------------------
121
+
122
def _poll_and_assemble(
    job_map: dict,
    provider,
    preprocessed: list[PreprocessedSentence],
    lemmes: list[str] | None,
    config: PipelineConfig,
    state: SessionState,
    preprocessed_path: str | None = None,
    cached_results: dict | None = None,
    submodel: str = "",
    prompt_map: dict | None = None,
) -> list[list[str]] | None:
    """Poll every job and assemble the per-sentence results.

    Args:
        job_map: ``{"c{i}_p{j}": job_id}`` for the submitted jobs.
        provider: Batch provider exposing ``poll_or_save``.
        preprocessed_path: Forwarded to ``poll_or_save`` and stored in the
            saved state when a job times out.
        cached_results: Raw responses already known, keyed like ``job_map``.
        submodel: Sub-model name used in the analysis-cache key.
        prompt_map: ``{cid: {"user": ..., "prompt_type": ...}}``; needed to
            store fresh responses in the analysis cache.

    Returns:
        The assembled results, or None if at least one job did not return a
        result; in that case the job map is written to
        ``mistral_batch_job.json`` in ``config.output_dir``.
    """
    raw_results = dict(cached_results) if cached_results else {}
    pending = []

    for cid, job_id in job_map.items():
        outcome = provider.poll_or_save(job_id, preprocessed_json=preprocessed_path)
        if outcome is None:
            pending.append(cid)
            continue

        raw_response = outcome.get(cid, "")
        raw_results[cid] = raw_response

        # Only non-empty responses with known prompt info are cached.
        cacheable = state.use_analysis_cache and raw_response and prompt_map and cid in prompt_map
        if not cacheable:
            continue
        from ppi_analyser.analysis.analysis_cache import set as acache_set
        entry = prompt_map[cid]
        acache_set(entry["user"], "", "mistral_batch", submodel,
                   entry["prompt_type"], raw_response)
        logger.debug("Analysis cache SET for %s", entry["prompt_type"])

    if not pending:
        return _assemble(raw_results, preprocessed, lemmes, config, state)

    snapshot = {"job_map": job_map, "preprocessed_json": preprocessed_path}
    state_path = Path(config.output_dir) / "mistral_batch_job.json"
    state_path.write_text(json.dumps(snapshot, indent=2), encoding="utf-8")
    logger.warning("%d jobs timed out. State saved to %s", len(pending), state_path)
    return None
164
+
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Assemble
169
+ # ---------------------------------------------------------------------------
170
+
171
def _assemble(
    raw_results: dict,
    preprocessed: list[PreprocessedSentence],
    lemmes: list[str] | None,
    config: PipelineConfig,
    state: SessionState,
) -> list[list[str]]:
    """Rebuild per-sentence results from the raw batch responses.

    Args:
        raw_results: ``{"c{chunk}_p{prop}": raw_response_string}``.

    Returns:
        ``results[sentence][property]``. Properties handled locally are
        computed with ``_handle_no_model_batch``; properties filtered out by
        ``config.properties`` / ``state.custom_properties_list`` are None for
        every sentence; the others are parsed from ``raw_results``.

    NOTE(review): in ORAL mode the locally handled indices are the fixed
    list [0, 1, 5], whereas submission skips ``state.no_ia`` — confirm both
    agree in that mode.
    """
    chunks = _chunk(preprocessed, config.batch_size)
    lemme_chunks = _chunk(lemmes, config.batch_size) if lemmes else [None] * len(chunks)
    all_results = []

    from ppi_analyser.config import AnalysisMode
    if state.analysis_mode == AnalysisMode.ORAL:
        local_props = [0, 1, 5]
    else:
        local_props = set(state.no_ia)

    for chunk_idx, (chunk, lemme_chunk) in enumerate(zip(chunks, lemme_chunks)):
        n_sents = len(chunk)
        expression = lemme_chunk[0] if lemme_chunk else config.expression

        # Only the system prompts are needed here, to recover each property's type.
        system_prompts, _ = get_prompts_batch(
            expression=expression,
            forme_relevee_list=[s.forme_relevee for s in chunk],
            conv_list=[s.cleaned for s in chunk],
            locuteur_list=[s.locuteur for s in chunk],
            interlocuteur_list=[s.interlocuteurs for s in chunk],
            mode=config.mode,
        )

        per_property = []
        for prop_idx, system_prompt in enumerate(system_prompts):
            if prop_idx in local_props:
                per_property.append(
                    _handle_no_model_batch(
                        system_prompt=system_prompt,
                        conversations=[s.cleaned for s in chunk],
                        expression=expression,
                        forme_relevee_list=[s.forme_relevee for s in chunk],
                        state=state,
                        mode=config.mode,
                        n_sentences=n_sents,
                        start_offset=chunk_idx * config.batch_size
                    )
                )
                continue

            cid = _custom_id(chunk_idx, prop_idx)
            prompt_type = get_prompt_type(system_prompt)
            filtered_out = (
                (config.properties and prompt_type not in config.properties)
                or (
                    state.custom_properties_list is not None
                    and prompt_type not in state.custom_properties_list
                )
            )
            if filtered_out:
                per_property.append([None] * n_sents)
                continue

            raw_response = raw_results.get(cid, "")
            if not raw_response:
                similar = [k for k in raw_results if k.startswith(f"c{chunk_idx}_")]
                logger.warning("No result for %s — keys for this chunk: %s", cid, similar)
            per_property.append(_parse_batch_response(raw_response, n_sents))

        # Transpose: prop × sent -> sent × prop
        for sent_idx in range(n_sents):
            all_results.append([
                None if values is None or sent_idx >= len(values) else values[sent_idx]
                for values in per_property
            ])

    return all_results
248
+
249
+
250
+ # ---------------------------------------------------------------------------
251
+ # Helpers
252
+ # ---------------------------------------------------------------------------
253
+
254
def _resolve_submodel(models: list[str]) -> str:
    """Return the sub-model name of the first ``mistral_batch_<submodel>`` entry.

    Raises:
        ValueError: No entry of ``models`` starts with ``mistral_batch_``.
    """
    prefix = "mistral_batch_"
    match = next((name for name in models if name.startswith(prefix)), None)
    if match is None:
        raise ValueError("No mistral_batch_<submodel> entry found in models list")
    return match[len(prefix):]
259
+
260
+
261
def _save_preprocessed(preprocessed, lemmes, path: Path) -> None:
    """Write the preprocessed sentences and their lemmas to ``path`` as JSON.

    Each sentence is stored with its ``raw``, ``cleaned``, ``locuteur``,
    ``interlocuteurs`` and ``forme_relevee`` attributes; ``lemmes`` is
    stored unchanged under the "lemmes" key.
    """
    field_names = ("raw", "cleaned", "locuteur", "interlocuteurs", "forme_relevee")
    records = [
        {name: getattr(sentence, name) for name in field_names}
        for sentence in preprocessed
    ]
    payload = json.dumps(
        {"preprocessed": records, "lemmes": lemmes},
        ensure_ascii=False,
        indent=2,
    )
    path.write_text(payload, encoding="utf-8")
    logger.info("Preprocessed sentences saved to %s", path)
277
+
278
+
279
def _load_preprocessed(path: str) -> tuple[list[PreprocessedSentence], list[str] | None]:
    """Read back a file written by ``_save_preprocessed``.

    Returns:
        ``(sentences, lemmes)`` where ``sentences`` are rebuilt
        ``PreprocessedSentence`` objects and ``lemmes`` is the stored list,
        or None when the file has no "lemmes" key.
    """
    payload = json.loads(Path(path).read_text(encoding="utf-8"))
    sentences = []
    for record in payload["preprocessed"]:
        sentences.append(
            PreprocessedSentence(
                raw=record["raw"],
                cleaned=record["cleaned"],
                locuteur=record["locuteur"],
                interlocuteurs=record["interlocuteurs"],
                forme_relevee=record["forme_relevee"],
            )
        )
    return sentences, payload.get("lemmes")
@@ -0,0 +1,27 @@
1
# Parts of speech a modifier may have
upos:
  - ADV
  - ADJ
  - NOUN
# Syntactic dependency relations relative to the head of the sentence
deprel:
  - obl:mod
  - nmod
  - acl:relcl
  - dislocated
  - amod

# Lemmas added on top of the standard PPI
lemma:
  - dieu
  - diable

# Exclusions

# The Universal Dependencies tag for punctuation is PUNCT ("PUNKT" is not a
# UPOS tag and could never match).
excluded_upos:
  - PUNCT

# NOTE(review): a bare "-" parses as a list holding a single null entry, not
# as an empty list — confirm the loader expects that.
excluded_deprel:
  -
excluded_lemma:
  -