PyPI - cnhkmcp - Versions diffs - 2.1.9__py3-none-any.whl → 2.3.0__py3-none-any.whl - Mend

cnhkmcp 2.1.9py3-none-any.whl → 2.3.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (142) hide show

cnhkmcp/untracked/APP/trailSomeAlphas/enhance_template.py ADDED Viewed

@@ -0,0 +1,588 @@
+import json
+import csv
+import os
+import re
+import subprocess
+import sys
+import time
+from pathlib import Path
+import requests
+BASE_DIR = Path(__file__).resolve().parent
+SKILLS_DIR = BASE_DIR / "skills"
+TEMPLATE_ENHANCE_DIR = SKILLS_DIR / "template_final_enhance"
+DEFAULT_FEATURE_IMPLEMENTATION_DIR = SKILLS_DIR / "brain-feature-implementation"
+DEFAULT_FEATURE_IMPLEMENTATION_SCRIPTS = DEFAULT_FEATURE_IMPLEMENTATION_DIR / "scripts"
+DEFAULT_MOONSHOT_MODEL = os.environ.get("MOONSHOT_MODEL", "kimi-k2.5")
+DEFAULT_MAX_ENHANCED_TEMPLATES = int(os.environ.get("MAX_ENHANCED_TEMPLATES", "60"))
+def find_latest_idea_json(feature_implementation_dir: Path) -> Path:
+	data_root = feature_implementation_dir / "data"
+	if not data_root.exists():
+		raise FileNotFoundError(f"data folder not found: {data_root}")
+	idea_files = list(data_root.glob("**/idea_*.json"))
+	if not idea_files:
+		raise FileNotFoundError(f"No idea_*.json found under: {data_root}")
+	# Prefer newest by mtime
+	return max(idea_files, key=lambda p: p.stat().st_mtime)
+def read_text(path: Path) -> str:
+	return path.read_text(encoding="utf-8")
+def call_moonshot(
+	api_key: str,
+	model: str,
+	system_prompt: str,
+	user_prompt: str,
+	timeout_s: int = 180,
+	retries: int = 2,
+	backoff_s: float = 2.0,
+) -> str:
+	base_url = os.environ.get("MOONSHOT_BASE_URL", "https://api.moonshot.cn/v1")
+	url = f"{base_url.rstrip('/')}/chat/completions"
+	headers = {
+		"Authorization": f"Bearer {api_key}",
+		"Content-Type": "application/json",
+	}
+	payload = {
+		"model": model,
+		"messages": [
+			{"role": "system", "content": system_prompt},
+			{"role": "user", "content": user_prompt},
+		],
+		"temperature": 1,
+		# Default to streaming so the user can observe model progress.
+		"stream": True,
+	}
+	def _stream_sse_and_collect(resp: requests.Response) -> str:
+		"""Read OpenAI-compatible SSE stream and print deltas live.
+		Still returns the full accumulated assistant content so existing callers
+		(which expect a string) keep working.
+		"""
+		content_parts: list[str] = []
+		thinking = False
+		for raw_line in resp.iter_lines(decode_unicode=True):
+			if not raw_line:
+				continue
+			line = raw_line.strip()
+			if not line.startswith("data:"):
+				continue
+			data_str = line[5:].strip()
+			if data_str == "[DONE]":
+				break
+			try:
+				event = json.loads(data_str)
+			except Exception:
+				continue
+			choices = event.get("choices") or []
+			if not choices:
+				continue
+			choice0 = choices[0] if isinstance(choices[0], dict) else None
+			if not choice0:
+				continue
+			delta = choice0.get("delta") or {}
+			if not isinstance(delta, dict):
+				delta = {}
+			# Moonshot/Kimi exposes reasoning tokens as `reasoning_content`.
+			reasoning = delta.get("reasoning_content")
+			if reasoning:
+				if not thinking:
+					thinking = True
+					print("=============开始思考=============", flush=True)
+				print(str(reasoning), end="", flush=True)
+			piece = delta.get("content")
+			if piece:
+				if thinking:
+					thinking = False
+					print("\n=============思考结束=============", flush=True)
+				content_parts.append(str(piece))
+				print(str(piece), end="", flush=True)
+			finish_reason = choice0.get("finish_reason")
+			if finish_reason:
+				break
+		if thinking:
+			print("\n=============思考结束=============", flush=True)
+		return "".join(content_parts)
+	last_exc: Exception | None = None
+	for attempt in range(retries + 1):
+		try:
+			resp = requests.post(url, headers=headers, json=payload, timeout=timeout_s, stream=True)
+			resp.encoding = "utf-8"
+			if resp.status_code >= 300:
+				raise RuntimeError(f"Moonshot API error {resp.status_code}: {resp.text}")
+			# Prefer SSE streaming when available.
+			ctype = (resp.headers.get("Content-Type") or "").lower()
+			if "text/event-stream" in ctype or payload.get("stream"):
+				return _stream_sse_and_collect(resp)
+			data = resp.json()
+			return data["choices"][0]["message"]["content"]
+		except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as exc:
+			last_exc = exc
+			if attempt >= retries:
+				raise
+			time.sleep(backoff_s * (2**attempt))
+		except requests.exceptions.RequestException as exc:
+			last_exc = exc
+			if attempt >= retries:
+				raise
+			time.sleep(backoff_s * (2**attempt))
+		except Exception as exc:
+			last_exc = exc
+			if attempt >= retries:
+				raise
+			time.sleep(backoff_s * (2**attempt))
+	raise last_exc or RuntimeError("Moonshot request failed")
+def _salvage_json_array(text: str):
+	"""Try parse JSON, or salvage the first JSON array in a text blob."""
+	try:
+		return json.loads(text)
+	except Exception:
+		m = re.search(r"\[.*\]", text, flags=re.DOTALL)
+		if not m:
+			return None
+		try:
+			return json.loads(m.group(0))
+		except Exception:
+			return None
+def _extract_items(parsed) -> list[dict]:
+	"""Normalize common LLM JSON shapes to a list of dict items."""
+	if isinstance(parsed, list):
+		return [x for x in parsed if isinstance(x, dict)]
+	if isinstance(parsed, dict):
+		for key in ("items", "data", "result", "results", "templates"):
+			val = parsed.get(key)
+			if isinstance(val, list):
+				return [x for x in val if isinstance(x, dict)]
+	return []
+def run_implement_idea(
+	feature_implementation_dir: Path,
+	scripts_dir: Path,
+	dataset_folder: str,
+	template: str,
+	idea: str,
+) -> Path | None:
+	"""Run implement_idea.py for one template.
+	Returns the newly created idea_*.json path if detectable.
+	"""
+	impl = scripts_dir / "implement_idea.py"
+	data_dir = feature_implementation_dir / "data" / dataset_folder
+	before = set(data_dir.glob("*_idea_*.json")) if data_dir.exists() else set()
+	args_list = [
+		sys.executable,
+		str(impl),
+		"--template",
+		template,
+		"--dataset",
+		dataset_folder,
+		"--idea",
+		idea or "",
+	]
+	result = subprocess.run(
+		args_list,
+		cwd=scripts_dir,
+		capture_output=True,
+		text=True,
+	)
+	if result.returncode != 0:
+		raise RuntimeError(
+			"Command failed: "
+			+ " ".join(args_list)
+			+ f"\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
+		)
+	after = set(data_dir.glob("*_idea_*.json")) if data_dir.exists() else set()
+	created = sorted(after - before)
+	return created[-1] if created else None
+def _extract_keys_from_template(template: str) -> list[str]:
+	return re.findall(r"\{([A-Za-z0-9_]+)\}", template)
+def _matches_metric(field_id: str, metric: str) -> bool:
+	if len(metric) <= 3:
+		return re.search(rf"(^|_){re.escape(metric)}(_|$)", field_id, flags=re.IGNORECASE) is not None
+	return metric in field_id
+def load_dataset_ids(dataset_csv: Path, max_rows: int = 200000) -> list[str]:
+	ids: list[str] = []
+	with dataset_csv.open("r", encoding="utf-8", newline="") as f:
+		reader = csv.reader(f)
+		header = next(reader, None)
+		if not header:
+			return ids
+		try:
+			id_idx = header.index("id")
+		except ValueError:
+			return ids
+		for i, row in enumerate(reader):
+			if i >= max_rows:
+				break
+			if id_idx < len(row):
+				val = (row[id_idx] or "").strip()
+				if val:
+					ids.append(val)
+	return ids
+def normalize_for_validator(expression: str) -> str:
+	"""Normalize expressions to satisfy validator rules (e.g., winsorize std=).
+	Currently converts winsorize(x, N) -> winsorize(x, std=N)
+	when the second argument is positional and not named.
+	"""
+	def _rewrite_func(expr: str, func_name: str, param_name: str) -> str:
+		out = []
+		i = 0
+		func_token = f"{func_name}("
+		while i < len(expr):
+			idx = expr.find(func_token, i)
+			if idx == -1:
+				out.append(expr[i:])
+				break
+			out.append(expr[i:idx])
+			j = idx + len(func_token)
+			depth = 1
+			args_start = j
+			while j < len(expr) and depth > 0:
+				ch = expr[j]
+				if ch == '(':
+					depth += 1
+				elif ch == ')':
+					depth -= 1
+				j += 1
+			args_str = expr[args_start : j - 1]
+			# split top-level args
+			args = []
+			buf = []
+			depth2 = 0
+			for ch in args_str:
+				if ch == '(':
+					depth2 += 1
+				elif ch == ')':
+					depth2 -= 1
+				if ch == ',' and depth2 == 0:
+					args.append("".join(buf).strip())
+					buf = []
+				else:
+					buf.append(ch)
+			if buf:
+				args.append("".join(buf).strip())
+			if len(args) >= 2 and '=' not in args[1]:
+				args[1] = f"{param_name}={args[1]}"
+			new_args = ", ".join(args)
+			out.append(f"{func_name}({new_args})")
+			i = j
+		return "".join(out)
+	normalized = expression
+	normalized = _rewrite_func(normalized, "winsorize", "std")
+	return normalized
+def parse_metadata_from_filename(path: Path) -> tuple[str, str, int] | None:
+	"""Extract dataset_id, region, delay from filename like:
+	<dataset>_<region>_<delay>_idea_<timestamp>.json
+	Returns (dataset_id, region, delay) or None.
+	"""
+	name = path.name
+	parts = name.split("_")
+	if len(parts) < 5:
+		return None
+	if parts[-2] != "idea":
+		return None
+	region = parts[-4]
+	delay_str = parts[-3]
+	if not delay_str.isdigit():
+		return None
+	dataset_id = "_".join(parts[:-4])
+	if not dataset_id:
+		return None
+	return dataset_id, region, int(delay_str)
+def main():
+	"""Zero-arg entrypoint.
+	Behavior:
+	- Pick the newest idea_*.json under skills/brain-feature-implementation/data/**
+	- Generate as many enhanced templates as possible (capped by DEFAULT_MAX_ENHANCED_TEMPLATES)
+	- Save enhanced templates JSON and (optionally) implement them
+	Optional env overrides:
+	- IDEA_JSON: absolute/relative path to a specific idea_*.json
+	- MOONSHOT_API_KEY / MOONSHOT_BASE_URL / MOONSHOT_MODEL
+	- MAX_ENHANCED_TEMPLATES (default 60)
+	- NO_IMPLEMENT=1 to skip implement_idea.py
+	"""
+	idea_json_env = os.environ.get("IDEA_JSON", "").strip()
+	if idea_json_env:
+		idea_json_path = Path(idea_json_env).expanduser().resolve()
+	else:
+		idea_json_path = find_latest_idea_json(DEFAULT_FEATURE_IMPLEMENTATION_DIR).resolve()
+	if not idea_json_path.exists():
+		raise FileNotFoundError(f"idea json not found: {idea_json_path}")
+	print(f"Using idea json: {idea_json_path}")
+	payload = json.loads(idea_json_path.read_text(encoding="utf-8"))
+	if not isinstance(payload, dict):
+		raise ValueError("idea json must be an object with 'template' and 'idea' fields")
+	if "template" not in payload:
+		raise ValueError("idea json missing required field 'template'")
+	if "idea" not in payload:
+		raise ValueError("idea json missing required field 'idea'")
+	raw_template = str(payload.get("template") or "").strip()
+	raw_idea = str(payload.get("idea") or "").strip()
+	if not raw_template:
+		raise ValueError("idea json field 'template' is empty")
+	# Infer feature-implementation location from the idea json path when possible:
+	# <...>/brain-feature-implementation/data/<dataset_folder>/idea_*.json
+	feature_implementation_dir = DEFAULT_FEATURE_IMPLEMENTATION_DIR
+	scripts_dir = DEFAULT_FEATURE_IMPLEMENTATION_SCRIPTS
+	try:
+		if idea_json_path.parent.parent.name.lower() == "data":
+			feature_implementation_dir = idea_json_path.parent.parent.parent
+			scripts_dir = feature_implementation_dir / "scripts"
+	except Exception:
+		pass
+	if not scripts_dir.exists():
+		raise FileNotFoundError(f"implement scripts folder not found: {scripts_dir}")
+	if idea_json_path.parent.parent.name.lower() == "data":
+		dataset_folder = idea_json_path.parent.name
+	else:
+		parsed = parse_metadata_from_filename(idea_json_path)
+		if not parsed:
+			raise ValueError(
+				"idea json filename must be like <dataset>_<region>_<delay>_idea_<ts>.json "
+				"when not located under brain-feature-implementation/data/<dataset_folder>/"
+			)
+		dataset_id_from_name, region_from_name, delay_from_name = parsed
+		dataset_folder = f"{dataset_id_from_name}_{region_from_name}_delay{delay_from_name}"
+	# Validate dataset CSV exists to ensure implement_idea can parse placeholders.
+	dataset_csv = feature_implementation_dir / "data" / dataset_folder / f"{dataset_folder}.csv"
+	if not dataset_csv.exists():
+		raise FileNotFoundError(
+			"Dataset CSV not found for enhancement. "
+			f"Expected: {dataset_csv}"
+		)
+	print(f"Using dataset CSV: {dataset_csv}")
+	try:
+		with dataset_csv.open("r", encoding="utf-8", newline="") as f:
+			reader = csv.reader(f)
+			header = next(reader, None)
+			row_count = 0
+			for _ in reader:
+				row_count += 1
+				if row_count >= 5:
+					break
+		if not header:
+			raise ValueError("Dataset CSV missing header row")
+		if row_count == 0:
+			raise ValueError("Dataset CSV has no data rows")
+		print(f"Dataset CSV header columns: {len(header)}; sample rows: {row_count}")
+	except Exception as e:
+		raise RuntimeError(f"Failed to read dataset CSV: {e}")
+	dataset_ids = load_dataset_ids(dataset_csv)
+	if not dataset_ids:
+		print("Warning: Could not load dataset ids from CSV 'id' column.")
+	guide1_path = TEMPLATE_ENHANCE_DIR / "单因子思考逻辑链.md"
+	guide2_path = TEMPLATE_ENHANCE_DIR / "op总结.md"
+	if not guide1_path.exists():
+		raise FileNotFoundError(f"Missing guidance file: {guide1_path}")
+	if not guide2_path.exists():
+		raise FileNotFoundError(f"Missing guidance file: {guide2_path}")
+	guide1 = read_text(guide1_path)
+	guide2 = read_text(guide2_path)
+	system_prompt = "\n\n".join(
+		[
+			"An alpha template is a reusable recipe that captures an economic idea and leaves “slots” (data fields, operators, groups, decay, neutralization choices, etc.) to instantiate many candidate alphas. Typical structure: clean data (backfill, winsorize) → transform/compare across time or peers → rank/neutralize → (optionally) decay/turnover tune. Templates encourage systematic search, reuse, and diversification while keeping an explicit economic rationale.",
+			"",
+			"Some Example Templates and rationales",
+			"",
+			"CAPM residual (market/sector-neutral return): ts_regression(returns, group_mean(returns, log(ts_mean(cap,21)), sector), 252, rettype=0) after backfill+winsorize. Rationale: strip market/sector beta to isolate idiosyncratic alpha; sector-weighted by smoothed log-cap to reduce large-cap dominance.",
+			"CAPM beta (slope) template: same regression with rettype=2; pre-clean target/market (ts_backfill(...,63) + winsorize(std=4)). Rationale: rank stocks by relative risk within sector; long low-β, short high-β, or study β dispersion across groups.",
+			"CAPM generalized to any feature: data = winsorize(ts_backfill(<data>,63),std=4); data_gpm = group_mean(data, log(ts_mean(cap,21)), sector); resid = ts_regression(data, data_gpm, 252, rettype=0). Rationale: pull out the component unexplained by group average of same feature; reduces common-mode exposure.",
+			"Actual vs estimate spread (analyst): group_zscore( group_zscore(<act>, industry) – group_zscore(<est>, industry), industry ) or the abstracted group_compare(diff(group_compare(act,...), group_compare(est,...)), ...). Rationale: surprise/beat-miss signal within industry, normalized to peers to avoid level bias.",
+			"Analyst term-structure (fp1 vs fy1/fp2/fy2): group_zscore( group_zscore(anl14_mean_eps_<period1>, industry) – group_zscore(anl14_mean_eps_<period2>, industry), industry ) with operator/group slots. Rationale: cross-period expectation steepness; rising near-term vs long-term forecasts can flag momentum/inflection.",
+			"Option Greeks net spread: group_operator(<put_greek> - <call_greek>, <grouping_data>) over industry/sector (Delta/Gamma/Vega/Theta). Rationale: options-implied sentiment/convexity skew vs peers; outlier net Greeks may precede spot moves; extend with multi-Greek composites or time-series deltas.",
+			"",
+			"based on the following guidance of how to make a data collation template into a signal, and guidance on how to utilize the best of operators.",
+			"",
+			"guidance of how to make a data collation template into a signal",
+			"--------------",
+			guide1,
+			"--------------",
+			"guidance on how to use the best of operators",
+			"--------------",
+			guide2,
+			"--------------",
+			"",
+			"Return ONLY valid JSON (no markdown / no code fences).",
+		]
+	)
+	user_prompt_obj = {
+		"instruction": "Improve the following raw template. Keep { } placeholders unchanged (they represent datafields). Return at least 5 diverse and complicate enhanced templates as possible.",
+		"input": {
+			"template": raw_template,
+			"idea": raw_idea,
+		},
+		"output_format": [
+			{"enhanced_template": "", "idea": ""},
+			{"enhanced_template": "", "idea": ""},
+		],
+		"idea_answer_in": "Chinese",
+	}
+	api_key = os.environ.get("MOONSHOT_API_KEY")
+	if not api_key:
+		raise ValueError("Missing Moonshot API key. Set MOONSHOT_API_KEY")
+	raw = call_moonshot(
+		api_key=api_key,
+		model=DEFAULT_MOONSHOT_MODEL,
+		system_prompt=system_prompt,
+		user_prompt=json.dumps(user_prompt_obj, ensure_ascii=False, indent=2),
+		timeout_s=600,
+		retries=2,
+		backoff_s=2.0,
+	)
+	parsed = _salvage_json_array(raw)
+	items = _extract_items(parsed)
+	if not items:
+		raise RuntimeError(f"LLM output did not contain a usable JSON array. Raw output:\n{raw}")
+	enhanced = []
+	for item in items:
+		t = str(item.get("enhanced_template") or item.get("template") or "").strip()
+		idea = str(item.get("idea") or "").strip()
+		if not t:
+			continue
+		enhanced.append({"template": t, "idea": idea})
+		# Do NOT truncate here; keep all returned templates.
+	if not enhanced:
+		raise RuntimeError(f"No enhanced templates parsed from LLM output. Raw output:\n{raw}")
+	out_dir = idea_json_path.parent
+	ts = int(time.time())
+	enhanced_path = out_dir / f"enhanced_templates_{ts}.json"
+	enhanced_path.write_text(json.dumps(enhanced, ensure_ascii=False, indent=2), encoding="utf-8")
+	print(f"Enhanced templates saved to: {enhanced_path}")
+	# Implement each enhanced template and merge expressions for this run only.
+	all_exprs = []
+	created_files = []
+	for idx, item in enumerate(enhanced, start=1):
+		t = item.get("template")
+		if not t:
+			continue
+		idea = item.get("idea") or ""
+		if dataset_ids:
+			metrics = _extract_keys_from_template(t)
+			missing = [m for m in metrics if not any(_matches_metric(fid, m) for fid in dataset_ids)]
+			if missing:
+				print(f"Template {idx} missing metrics in CSV id list: {missing}")
+		print(f"\n[{idx}/{len(enhanced)}] Implementing enhanced_template: {t}")
+		created = run_implement_idea(
+			feature_implementation_dir=feature_implementation_dir,
+			scripts_dir=scripts_dir,
+			dataset_folder=dataset_folder,
+			template=t,
+			idea=idea,
+		)
+		if created:
+			created_files.append(created)
+	for jf in created_files:
+		try:
+			data = json.loads(jf.read_text(encoding="utf-8"))
+			exprs = data.get("expression_list", [])
+			if exprs:
+				all_exprs.extend([str(x) for x in exprs])
+		except Exception:
+			pass
+	unique = []
+	seen = set()
+	for ex in all_exprs:
+		norm_ex = normalize_for_validator(ex)
+		if norm_ex not in seen:
+			unique.append(norm_ex)
+			seen.add(norm_ex)
+	# Validate expressions and keep only valid ones
+	try:
+		if str(scripts_dir) not in sys.path:
+			sys.path.insert(0, str(scripts_dir))
+		from validator import ExpressionValidator  # type: ignore
+		validator = ExpressionValidator()
+		validated = []
+		for expr in unique:
+			result = validator.check_expression(expr)
+			if result.get("valid"):
+				validated.append(expr)
+			else:
+				print(f"Invalid expression filtered: {expr}")
+		unique = validated
+		print(f"Validation kept {len(unique)} expressions")
+	except Exception as e:
+		print(f"Warning: validator failed, keeping unvalidated expressions. Error: {e}")
+	merged_path = out_dir / f"enhanced_final_expressions_{ts}.json"
+	merged_path.write_text(json.dumps(unique, ensure_ascii=False, indent=2), encoding="utf-8")
+	print(f"\nMerged {len(unique)} unique expressions to: {merged_path}")
+if __name__ == "__main__":
+	main()

cnhkmcp/untracked/APP/trailSomeAlphas/requirements.txt ADDED Viewed

@@ -0,0 +1,3 @@
+pandas
+requests
+tqdm

cnhkmcp 2.1.9__py3-none-any.whl → 2.3.0__py3-none-any.whl

cnhkmcp 2.1.9py3-none-any.whl → 2.3.0py3-none-any.whl