cnhkmcp 2.1.2__py3-none-any.whl → 2.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. {cnhkmcp-2.1.2.dist-info → cnhkmcp-2.1.3.dist-info}/METADATA +1 -1
  2. cnhkmcp-2.1.3.dist-info/RECORD +6 -0
  3. cnhkmcp-2.1.3.dist-info/top_level.txt +1 -0
  4. cnhkmcp/__init__.py +0 -125
  5. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/README.md +0 -38
  6. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/ace.log +0 -0
  7. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/config.json +0 -6
  8. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/get_knowledgeBase_tool/ace_lib.py +0 -1510
  9. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/get_knowledgeBase_tool/fetch_all_datasets.py +0 -157
  10. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/get_knowledgeBase_tool/fetch_all_documentation.py +0 -132
  11. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/get_knowledgeBase_tool/fetch_all_operators.py +0 -99
  12. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/get_knowledgeBase_tool/helpful_functions.py +0 -180
  13. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/icon.ico +0 -0
  14. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/icon.png +0 -0
  15. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/knowledge/test.txt +0 -1
  16. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/main.py +0 -576
  17. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/process_knowledge_base.py +0 -281
  18. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/rag_engine.py +0 -408
  19. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/requirements.txt +0 -7
  20. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/run.bat +0 -3
  21. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242//321/211/320/266/320/246/321/206/320/274/320/261/321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/231/320/243/321/205/342/225/235/320/220/321/206/320/230/320/241.py +0 -265
  22. cnhkmcp/untracked/APP/.gitignore +0 -32
  23. cnhkmcp/untracked/APP/MODULAR_STRUCTURE.md +0 -112
  24. cnhkmcp/untracked/APP/README.md +0 -309
  25. cnhkmcp/untracked/APP/Tranformer/Transformer.py +0 -4985
  26. cnhkmcp/untracked/APP/Tranformer/ace.log +0 -0
  27. cnhkmcp/untracked/APP/Tranformer/ace_lib.py +0 -1510
  28. cnhkmcp/untracked/APP/Tranformer/helpful_functions.py +0 -180
  29. cnhkmcp/untracked/APP/Tranformer/output/Alpha_candidates.json +0 -2421
  30. cnhkmcp/untracked/APP/Tranformer/output/Alpha_candidates_/321/207/320/264/342/225/221/321/204/342/225/233/320/233.json +0 -654
  31. cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_error.json +0 -1034
  32. cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_success.json +0 -444
  33. cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_/321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/320/237/320/277/321/207/320/253/342/224/244/321/206/320/236/320/265/321/210/342/225/234/342/225/234/321/205/320/225/320/265Machine_lib.json +0 -22
  34. cnhkmcp/untracked/APP/Tranformer/parsetab.py +0 -60
  35. cnhkmcp/untracked/APP/Tranformer/template_summary.txt +0 -3182
  36. cnhkmcp/untracked/APP/Tranformer/transformer_config.json +0 -7
  37. cnhkmcp/untracked/APP/Tranformer/validator.py +0 -889
  38. cnhkmcp/untracked/APP/ace.log +0 -69
  39. cnhkmcp/untracked/APP/ace_lib.py +0 -1510
  40. cnhkmcp/untracked/APP/blueprints/__init__.py +0 -6
  41. cnhkmcp/untracked/APP/blueprints/feature_engineering.py +0 -347
  42. cnhkmcp/untracked/APP/blueprints/idea_house.py +0 -221
  43. cnhkmcp/untracked/APP/blueprints/inspiration_house.py +0 -432
  44. cnhkmcp/untracked/APP/blueprints/paper_analysis.py +0 -570
  45. cnhkmcp/untracked/APP/custom_templates/templates.json +0 -1257
  46. cnhkmcp/untracked/APP/give_me_idea/BRAIN_Alpha_Template_Expert_SystemPrompt.md +0 -400
  47. cnhkmcp/untracked/APP/give_me_idea/ace_lib.py +0 -1510
  48. cnhkmcp/untracked/APP/give_me_idea/alpha_data_specific_template_master.py +0 -252
  49. cnhkmcp/untracked/APP/give_me_idea/fetch_all_datasets.py +0 -157
  50. cnhkmcp/untracked/APP/give_me_idea/fetch_all_operators.py +0 -99
  51. cnhkmcp/untracked/APP/give_me_idea/helpful_functions.py +0 -180
  52. cnhkmcp/untracked/APP/give_me_idea/what_is_Alpha_template.md +0 -11
  53. cnhkmcp/untracked/APP/helpful_functions.py +0 -180
  54. cnhkmcp/untracked/APP/hkSimulator/ace_lib.py +0 -1497
  55. cnhkmcp/untracked/APP/hkSimulator/autosimulator.py +0 -447
  56. cnhkmcp/untracked/APP/hkSimulator/helpful_functions.py +0 -180
  57. cnhkmcp/untracked/APP/mirror_config.txt +0 -20
  58. cnhkmcp/untracked/APP/operaters.csv +0 -129
  59. cnhkmcp/untracked/APP/requirements.txt +0 -53
  60. cnhkmcp/untracked/APP/run_app.bat +0 -28
  61. cnhkmcp/untracked/APP/run_app.sh +0 -34
  62. cnhkmcp/untracked/APP/setup_tsinghua.bat +0 -39
  63. cnhkmcp/untracked/APP/setup_tsinghua.sh +0 -43
  64. cnhkmcp/untracked/APP/simulator/alpha_submitter.py +0 -404
  65. cnhkmcp/untracked/APP/simulator/simulator_wqb.py +0 -618
  66. cnhkmcp/untracked/APP/ssrn-3332513.pdf +6 -109201
  67. cnhkmcp/untracked/APP/static/brain.js +0 -589
  68. cnhkmcp/untracked/APP/static/decoder.js +0 -1540
  69. cnhkmcp/untracked/APP/static/feature_engineering.js +0 -1729
  70. cnhkmcp/untracked/APP/static/idea_house.js +0 -937
  71. cnhkmcp/untracked/APP/static/inspiration.js +0 -465
  72. cnhkmcp/untracked/APP/static/inspiration_house.js +0 -868
  73. cnhkmcp/untracked/APP/static/paper_analysis.js +0 -390
  74. cnhkmcp/untracked/APP/static/script.js +0 -3082
  75. cnhkmcp/untracked/APP/static/simulator.js +0 -597
  76. cnhkmcp/untracked/APP/static/styles.css +0 -3127
  77. cnhkmcp/untracked/APP/static/usage_widget.js +0 -508
  78. cnhkmcp/untracked/APP/templates/alpha_inspector.html +0 -511
  79. cnhkmcp/untracked/APP/templates/feature_engineering.html +0 -960
  80. cnhkmcp/untracked/APP/templates/idea_house.html +0 -564
  81. cnhkmcp/untracked/APP/templates/index.html +0 -932
  82. cnhkmcp/untracked/APP/templates/inspiration_house.html +0 -861
  83. cnhkmcp/untracked/APP/templates/paper_analysis.html +0 -91
  84. cnhkmcp/untracked/APP/templates/simulator.html +0 -343
  85. cnhkmcp/untracked/APP/templates/transformer_web.html +0 -580
  86. cnhkmcp/untracked/APP/usage.md +0 -351
  87. cnhkmcp/untracked/APP//321/207/342/225/235/320/250/321/205/320/230/320/226/321/204/342/225/225/320/220/321/211/320/221/320/243/321/206/320/261/320/265/ace_lib.py +0 -1510
  88. cnhkmcp/untracked/APP//321/207/342/225/235/320/250/321/205/320/230/320/226/321/204/342/225/225/320/220/321/211/320/221/320/243/321/206/320/261/320/265/brain_alpha_inspector.py +0 -712
  89. cnhkmcp/untracked/APP//321/207/342/225/235/320/250/321/205/320/230/320/226/321/204/342/225/225/320/220/321/211/320/221/320/243/321/206/320/261/320/265/helpful_functions.py +0 -180
  90. cnhkmcp/untracked/APP//321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/231/320/243/321/205/342/225/235/320/220/321/206/320/230/320/241.py +0 -2456
  91. cnhkmcp/untracked/arXiv_API_Tool_Manual.md +0 -490
  92. cnhkmcp/untracked/arxiv_api.py +0 -229
  93. cnhkmcp/untracked/forum_functions.py +0 -998
  94. cnhkmcp/untracked/mcp/321/206/320/246/320/227/321/204/342/225/227/342/225/242/321/210/320/276/342/225/221/321/205/320/255/320/253/321/207/320/231/320/2302_/321/205/320/266/320/222/321/206/320/256/320/254/321/205/320/236/320/257/321/207/320/231/320/230/321/205/320/240/320/277/321/205/320/232/320/270/321/204/342/225/225/320/235/321/204/342/225/221/320/226/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270/321/205/342/226/221/342/226/222/321/210/320/277/320/245/321/210/342/224/220/320/251/321/204/342/225/225/320/272/forum_functions.py +0 -407
  95. cnhkmcp/untracked/mcp/321/206/320/246/320/227/321/204/342/225/227/342/225/242/321/210/320/276/342/225/221/321/205/320/255/320/253/321/207/320/231/320/2302_/321/205/320/266/320/222/321/206/320/256/320/254/321/205/320/236/320/257/321/207/320/231/320/230/321/205/320/240/320/277/321/205/320/232/320/270/321/204/342/225/225/320/235/321/204/342/225/221/320/226/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270/321/205/342/226/221/342/226/222/321/210/320/277/320/245/321/210/342/224/220/320/251/321/204/342/225/225/320/272/platform_functions.py +0 -2415
  96. cnhkmcp/untracked/mcp/321/206/320/246/320/227/321/204/342/225/227/342/225/242/321/210/320/276/342/225/221/321/205/320/255/320/253/321/207/320/231/320/2302_/321/205/320/266/320/222/321/206/320/256/320/254/321/205/320/236/320/257/321/207/320/231/320/230/321/205/320/240/320/277/321/205/320/232/320/270/321/204/342/225/225/320/235/321/204/342/225/221/320/226/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270/321/205/342/226/221/342/226/222/321/210/320/277/320/245/321/210/342/224/220/320/251/321/204/342/225/225/320/272/user_config.json +0 -31
  97. cnhkmcp/untracked/mcp/321/206/320/246/320/227/321/204/342/225/227/342/225/242/321/210/320/276/342/225/221/321/205/320/255/320/253/321/207/320/231/320/2302_/321/205/320/266/320/222/321/206/320/256/320/254/321/205/320/236/320/257/321/207/320/231/320/230/321/205/320/240/320/277/321/205/320/232/320/270/321/204/342/225/225/320/235/321/204/342/225/221/320/226/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270/321/205/342/226/221/342/226/222/321/210/320/277/320/245/321/210/342/224/220/320/251/321/204/342/225/225/320/272//321/210/320/276/320/271AI/321/210/320/277/342/225/227/321/210/342/224/220/320/251/321/204/342/225/225/320/272/321/206/320/246/320/227/321/206/320/261/320/263/321/206/320/255/320/265/321/205/320/275/320/266/321/204/342/225/235/320/252/321/204/342/225/225/320/233/321/210/342/225/234/342/225/234/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270.md +0 -101
  98. cnhkmcp/untracked/mcp/321/206/320/246/320/227/321/204/342/225/227/342/225/242/321/210/320/276/342/225/221/321/205/320/255/320/253/321/207/320/231/320/2302_/321/205/320/266/320/222/321/206/320/256/320/254/321/205/320/236/320/257/321/207/320/231/320/230/321/205/320/240/320/277/321/205/320/232/320/270/321/204/342/225/225/320/235/321/204/342/225/221/320/226/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270/321/205/342/226/221/342/226/222/321/210/320/277/320/245/321/210/342/224/220/320/251/321/204/342/225/225/320/272//321/211/320/225/320/235/321/207/342/225/234/320/276/321/205/320/231/320/235/321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/230/320/241_/321/205/320/276/320/231/321/210/320/263/320/225/321/205/342/224/220/320/225/321/210/320/266/320/221/321/204/342/225/233/320/255/321/210/342/225/241/320/246/321/205/320/234/320/225.py +0 -190
  99. cnhkmcp/untracked/platform_functions.py +0 -2886
  100. cnhkmcp/untracked/sample_mcp_config.json +0 -11
  101. cnhkmcp/untracked/user_config.json +0 -31
  102. cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/320/237/320/222/321/210/320/220/320/223/321/206/320/246/320/227/321/206/320/261/320/263_BRAIN_Alpha_Test_Requirements_and_Tips.md +0 -202
  103. cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/342/225/226/320/265/321/204/342/225/234/320/254/321/206/342/225/241/320/221_Alpha_explaination_workflow.md +0 -56
  104. cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/342/225/226/320/265/321/204/342/225/234/320/254/321/206/342/225/241/320/221_BRAIN_6_Tips_Datafield_Exploration_Guide.md +0 -194
  105. cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/342/225/226/320/265/321/204/342/225/234/320/254/321/206/342/225/241/320/221_BRAIN_Alpha_Improvement_Workflow.md +0 -101
  106. cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/342/225/226/320/265/321/204/342/225/234/320/254/321/206/342/225/241/320/221_Dataset_Exploration_Expert_Manual.md +0 -436
  107. cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/342/225/226/320/265/321/204/342/225/234/320/254/321/206/342/225/241/320/221_daily_report_workflow.md +0 -128
  108. cnhkmcp/untracked//321/211/320/225/320/235/321/207/342/225/234/320/276/321/205/320/231/320/235/321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/230/320/241_/321/205/320/276/320/231/321/210/320/263/320/225/321/205/342/224/220/320/225/321/210/320/266/320/221/321/204/342/225/233/320/255/321/210/342/225/241/320/246/321/205/320/234/320/225.py +0 -190
  109. cnhkmcp-2.1.2.dist-info/RECORD +0 -111
  110. cnhkmcp-2.1.2.dist-info/top_level.txt +0 -1
  111. {cnhkmcp-2.1.2.dist-info → cnhkmcp-2.1.3.dist-info}/WHEEL +0 -0
  112. {cnhkmcp-2.1.2.dist-info → cnhkmcp-2.1.3.dist-info}/entry_points.txt +0 -0
  113. {cnhkmcp-2.1.2.dist-info → cnhkmcp-2.1.3.dist-info}/licenses/LICENSE +0 -0
@@ -1,570 +0,0 @@
1
- """
2
- Paper Analysis Blueprint - Flask Blueprint for analyzing research papers using Deepseek AI
3
- """
4
-
5
- from flask import Blueprint, render_template, request, jsonify
6
- import requests
7
- import json
8
- import os
9
- import tempfile
10
- from werkzeug.utils import secure_filename
11
-
12
# Blueprint grouping every paper-analysis route under the /paper-analysis URL prefix.
paper_analysis_bp = Blueprint('paper_analysis', __name__, url_prefix='/paper-analysis')


@paper_analysis_bp.route('/')
def paper_analysis():
    """Render and return the ``paper_analysis.html`` template for the blueprint root."""
    page = render_template('paper_analysis.html')
    return page
19
-
20
@paper_analysis_bp.route('/api/test-deepseek', methods=['POST'])
def test_deepseek():
    """Verify that the caller's Deepseek API key can reach the chat endpoint.

    The key is read from the ``X-API-Key`` request header. A minimal chat
    completion ("Say hello", 10 tokens) is sent with a 10 second timeout.

    Returns:
        A Flask JSON response:
        * 401 with ``{'error': ...}`` when the header is missing or empty;
        * ``success: True`` plus the decoded API response when the call is OK;
        * ``success: False`` with the upstream status code when the API
          answers with an error status;
        * 500 with ``success: False`` on connection or unexpected errors.
    """
    def failure(label, exc):
        # Both exception paths share the same payload shape and a 500 status.
        return jsonify({
            'success': False,
            'error': label,
            'details': str(exc)
        }), 500

    try:
        api_key = request.headers.get('X-API-Key')
        if not api_key:
            return jsonify({'error': 'API key is required'}), 401

        auth_headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json'
        }
        # Smallest possible prompt: we only care whether the call succeeds.
        probe = {
            'model': 'deepseek-chat',
            'messages': [
                {'role': 'user', 'content': 'Say hello'}
            ],
            'max_tokens': 10
        }

        test_response = requests.post(
            'https://api.deepseek.com/v1/chat/completions',
            headers=auth_headers,
            json=probe,
            timeout=10
        )

        if not test_response.ok:
            # Mirror the upstream status code so the client sees the real cause.
            return jsonify({
                'success': False,
                'error': f'API Error: {test_response.status_code}',
                'details': test_response.text
            }), test_response.status_code

        return jsonify({
            'success': True,
            'message': 'Deepseek API connection successful',
            'response': test_response.json()
        })

    except requests.exceptions.RequestException as e:
        return failure('Connection error', e)
    except Exception as e:
        return failure('Unexpected error', e)
72
-
73
@paper_analysis_bp.route('/api/analyze-paper', methods=['POST'])
def analyze_paper():
    """Analyze an uploaded paper with the Deepseek API.

    Request:
        * header ``X-API-Key``: Deepseek API key (401 if absent);
        * form flags ``extract_keywords``, ``generate_summary`` and
          ``find_related``: enabled only when the value is the string ``'true'``;
        * multipart field ``file``: the document, non-empty and at most 50MB.

    Returns:
        JSON with ``keywords``, ``summary`` and ``related_works``. Note that the
        ``find_related`` flag fills ``related_works`` with the output of
        ``extract_formulas_with_deepseek``. Validation problems yield 400,
        any other exception yields 500 with the exception text.
    """
    def bad_request(message):
        # Every validation failure uses the same {'error': ...} / 400 shape.
        return jsonify({'error': message}), 400

    try:
        # Get API key from header
        api_key = request.headers.get('X-API-Key')
        if not api_key:
            return jsonify({'error': 'API key is required'}), 401

        # Analysis options are sent as the literal string 'true'.
        want_keywords = request.form.get('extract_keywords') == 'true'
        want_summary = request.form.get('generate_summary') == 'true'
        want_related = request.form.get('find_related') == 'true'

        if 'file' not in request.files:
            return bad_request('No file uploaded')

        upload = request.files['file']
        if upload.filename == '':
            return bad_request('No file selected')

        # Measure the upload by seeking to its end, then rewind for saving.
        upload.seek(0, 2)
        file_size = upload.tell()
        upload.seek(0)

        if file_size > 50 * 1024 * 1024:  # 50MB limit
            return bad_request('File too large. Maximum size is 50MB')
        if file_size == 0:
            return bad_request('File is empty')

        filename = secure_filename(upload.filename)
        print(f"Processing file: {filename} (size: {file_size} bytes)")

        # Keep the original extension so the extractor can dispatch on it.
        suffix = os.path.splitext(filename)[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            upload.save(temp_file.name)
            temp_path = temp_file.name

        try:
            results = {
                'keywords': [],
                'summary': '',
                'related_works': []
            }

            text = extract_text_from_file(temp_path, filename)
            if not text or not text.strip():
                return bad_request('Could not extract text from the file. The file might be empty or in an unsupported format.')

            text = text.strip()
            print(f"Final text length before truncation: {len(text)}")

            # Very short output usually means an image-only PDF.
            if len(text) < 100:
                return bad_request('Extracted text is too short. This might be a scanned PDF without OCR text. Please ensure your PDF contains selectable text, not just images.')

            # Trim oversized documents before sending them to the API.
            text = process_large_document(text)

            headers = {
                'Authorization': f'Bearer {api_key}',
                'Content-Type': 'application/json'
            }

            if want_keywords:
                results['keywords'] = extract_keywords_with_deepseek(text, headers)
            if want_summary:
                results['summary'] = generate_summary_with_deepseek(text, headers)
            if want_related:
                results['related_works'] = extract_formulas_with_deepseek(text, headers)

            return jsonify(results)

        finally:
            # The temp file was created with delete=False, so remove it here.
            try:
                os.unlink(temp_path)
            except Exception as e:
                print(f"Error deleting temporary file: {str(e)}")

    except Exception as e:
        print(f"Analyze paper error: {str(e)}")
        return jsonify({'error': str(e)}), 500
168
-
169
def extract_text_from_file(file_path, filename):
    """Return the text of ``file_path``, choosing an extractor by extension.

    Args:
        file_path: Path of the file on disk.
        filename: Original file name; only its (case-insensitive) extension
            is used to pick the extractor.

    Returns:
        The extracted text. Unknown extensions are read as plain text.

    Raises:
        Exception: ``"Error reading file: ..."`` wrapping any extractor error.
    """
    suffix = os.path.splitext(filename)[1].lower()

    try:
        if suffix == '.pdf':
            return extract_pdf_text(file_path)
        if suffix in ('.docx', '.doc'):
            return extract_word_text(file_path, suffix)
        if suffix == '.rtf':
            return extract_rtf_text(file_path)
        if suffix in ('.tex', '.latex'):
            return extract_latex_text(file_path)
        if suffix in ('.md', '.markdown'):
            return extract_markdown_text(file_path)
        # Anything else is treated as a plain text file.
        return extract_plain_text(file_path)
    except Exception as e:
        print(f"File processing error: {str(e)}")
        raise Exception(f"Error reading file: {str(e)}")
193
-
194
def extract_pdf_text(file_path):
    """Extract text from a PDF, trying several optional libraries in turn.

    Strategy:
        1. PyPDF2 page by page; a page that fails is logged and skipped.
        2. If PyPDF2 is not installed, pdfplumber.
        3. If PyPDF2 is installed but fails, PyMuPDF (``fitz``).

    Args:
        file_path: Path of the PDF file.

    Returns:
        The concatenated page texts, each followed by a newline.

    Raises:
        Exception: when no usable library is installed or every attempt fails.
            The underlying error is attached as ``__cause__``.
    """
    try:
        from PyPDF2 import PdfReader
        reader = PdfReader(file_path)
        num_pages = len(reader.pages)
        print(f"PDF has {num_pages} pages")

        # Collect pieces and join once instead of growing a string per page.
        pieces = []
        for i, page in enumerate(reader.pages):
            try:
                page_text = page.extract_text()
                if page_text:
                    pieces.append(page_text + '\n')
                print(f"Extracted page {i+1}/{num_pages}")
            except Exception as page_error:
                # One unreadable page must not abort the whole document.
                print(f"Error extracting page {i+1}: {str(page_error)}")
                continue

        text = ''.join(pieces)
        print(f"Total extracted text length: {len(text)}")
        return text

    except ImportError:
        # PyPDF2 is missing: try the alternative PDF library.
        try:
            import pdfplumber
            pieces = []
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        pieces.append(page_text + '\n')
            return ''.join(pieces)
        except ImportError as import_error:
            raise Exception('PDF processing is not available. Please install PyPDF2 or pdfplumber.') from import_error
    except Exception as pdf_error:
        print(f"PDF extraction error: {str(pdf_error)}")
        # PyPDF2 failed on this file: try PyMuPDF as fallback.
        try:
            import fitz  # PyMuPDF
        except ImportError:
            raise Exception(f'Could not extract text from PDF: {str(pdf_error)}. Try installing PyMuPDF.') from pdf_error
        try:
            pdf_document = fitz.open(file_path)
            try:
                pieces = []
                for page_num in range(pdf_document.page_count):
                    pieces.append(pdf_document[page_num].get_text() + '\n')
                return ''.join(pieces)
            finally:
                # Always release the document, even when a page fails.
                pdf_document.close()
        except Exception as mupdf_error:
            # Message still reports the first failure; the PyMuPDF failure is
            # preserved as the chained cause instead of being discarded.
            raise Exception(f'PDF extraction failed: {str(pdf_error)}') from mupdf_error
245
-
246
def extract_word_text(file_path, file_ext):
    """Extract text from a Word document.

    Args:
        file_path: Path of the document.
        file_ext: Lower-case extension. ``'.docx'`` is read with python-docx;
            any other value is passed to docx2txt.

    Returns:
        The document text (for ``.docx``: paragraph texts joined by newlines).

    Raises:
        Exception: an install hint when the required library is missing, or
            ``"Error reading Word document: ..."`` when reading fails. The
            install hint is raised as-is rather than being re-wrapped as a
            read error.
    """
    if file_ext == '.docx':
        try:
            from docx import Document
        except ImportError as import_error:
            raise Exception('Word document support requires python-docx. Please install it with: pip install python-docx') from import_error
        try:
            doc = Document(file_path)
            return '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        except Exception as docx_error:
            raise Exception(f'Error reading Word document: {str(docx_error)}') from docx_error

    # .doc files
    # NOTE(review): docx2txt presumably targets .docx archives; confirm that it
    # can really read legacy binary .doc files.
    try:
        import docx2txt
    except ImportError as import_error:
        raise Exception('DOC file support requires docx2txt. Please install it with: pip install docx2txt') from import_error
    try:
        return docx2txt.process(file_path)
    except Exception as docx_error:
        raise Exception(f'Error reading Word document: {str(docx_error)}') from docx_error
264
-
265
def extract_rtf_text(file_path):
    """Extract plain text from an RTF file using the ``striprtf`` package.

    Args:
        file_path: Path of the RTF file (read as UTF-8).

    Returns:
        The text with RTF control words removed.

    Raises:
        Exception: an install hint when striprtf is missing, or
            ``"Error reading RTF file: ..."`` when reading/conversion fails.
    """
    try:
        # rtf_to_text is defined in the striprtf.striprtf submodule; accessing
        # it as striprtf.rtf_to_text fails with AttributeError.
        from striprtf.striprtf import rtf_to_text
    except ImportError as import_error:
        raise Exception('RTF support requires striprtf. Please install it with: pip install striprtf') from import_error

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            rtf_content = f.read()
        return rtf_to_text(rtf_content)
    except Exception as rtf_error:
        raise Exception(f'Error reading RTF file: {str(rtf_error)}') from rtf_error
276
-
277
def extract_latex_text(file_path):
    """Extract readable text from a LaTeX source file.

    This is a rough regex-based cleanup, not a LaTeX parser:
        * ``%`` comments are removed, but an escaped ``\\%`` is kept as a
          literal percent sign instead of truncating the line;
        * ``\\begin{...}`` / ``\\end{...}`` markers are dropped;
        * ``\\command{content}`` is replaced by ``content``;
        * remaining bare ``\\command`` tokens are dropped.

    Args:
        file_path: Path of the ``.tex`` file (read as UTF-8).

    Returns:
        The cleaned text.

    Raises:
        Exception: ``"Error reading LaTeX file: ..."`` on any failure.
    """
    import re

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Remove comments. The lookbehind skips "\%", which is a literal
        # percent sign in LaTeX rather than the start of a comment.
        text = re.sub(r'(?<!\\)%.*$', '', text, flags=re.MULTILINE)
        # Remove common LaTeX commands but keep content
        text = re.sub(r'\\(begin|end)\{[^}]+\}', '', text)
        text = re.sub(r'\\[a-zA-Z]+\*?\{([^}]+)\}', r'\1', text)
        text = re.sub(r'\\[a-zA-Z]+\*?', '', text)
        # Finally turn the preserved escapes into plain percent signs.
        return text.replace('\\%', '%')
    except Exception as tex_error:
        raise Exception(f'Error reading LaTeX file: {str(tex_error)}') from tex_error
294
-
295
def extract_markdown_text(file_path):
    """Read a Markdown file and simplify its link syntax.

    Image references (``![alt](url)``) are removed entirely and ordinary
    links (``[label](url)``) are reduced to their label. All other Markdown
    syntax is left as-is.

    Args:
        file_path: Path of the Markdown file (read as UTF-8).

    Returns:
        The simplified text.

    Raises:
        Exception: ``"Error reading Markdown file: ..."`` on any failure.
    """
    try:
        import re

        with open(file_path, 'r', encoding='utf-8') as handle:
            raw = handle.read()

        # Images must go first, otherwise the link rule would keep their alt text.
        without_images = re.sub(r'!\[[^\]]*\]\([^)]+\)', '', raw)
        labels_only = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', without_images)
        return labels_only
    except Exception as md_error:
        raise Exception(f'Error reading Markdown file: {str(md_error)}')
309
-
310
def extract_plain_text(file_path):
    """Read a text file whose encoding is unknown.

    UTF-16/UTF-32 are only attempted when the file starts with the matching
    byte-order mark. Without that guard a BOM-less ``utf-16`` attempt accepts
    almost any even-length byte string, so GBK/Big5 files would be decoded
    into garbage before their real encoding is ever tried. Files without a
    BOM are tried as UTF-8 (a UTF-8 BOM is stripped), then GBK, GB2312, Big5
    and finally Latin-1. BOM-less UTF-16 files are therefore not detected.

    Args:
        file_path: Path of the text file.

    Returns:
        The decoded file content.

    Raises:
        OSError: when the file cannot be opened.
    """
    import codecs

    # Peek at the first bytes to detect a byte-order mark.
    with open(file_path, 'rb') as f:
        head = f.read(4)

    # UTF-32 must be checked first: its little-endian BOM begins with the
    # UTF-16 little-endian BOM.
    if head.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
        encodings = ['utf-32']
    elif head.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
        encodings = ['utf-16']
    else:
        encodings = ['utf-8-sig', 'gbk', 'gb2312', 'big5', 'latin-1']

    text = None
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                text = f.read()
            print(f"Successfully read file with {encoding} encoding")
            break
        except UnicodeDecodeError:
            continue
        except Exception as e:
            print(f"Error reading with {encoding}: {str(e)}")
            continue

    if text is None:
        # Last resort (e.g. a truncated UTF-16 file): decode the raw bytes and
        # drop whatever cannot be interpreted. This never raises.
        with open(file_path, 'rb') as f:
            text = f.read().decode('utf-8', errors='ignore')

    return text
337
-
338
def process_large_document(text, max_chars=98000, context_lines=5, math_only_threshold=50000):
    """Shrink an oversized document, favouring lines that look mathematical.

    Text of at most ``max_chars`` characters is returned unchanged. Otherwise
    every line containing a math symbol or one of the words equation, formula,
    theorem, lemma or proof is kept together with ``context_lines`` lines
    before and after it. Each line is kept once, in document order, even when
    the context windows of neighbouring matches overlap.

    Args:
        text: Full document text.
        max_chars: Upper bound on the length of the returned text.
        context_lines: Lines of context kept on each side of a match.
        math_only_threshold: When the math-rich selection is longer than
            this, only that selection (truncated) is returned.

    Returns:
        * ``text`` itself when it is short enough;
        * the first ``max_chars`` characters when nothing looks mathematical;
        * the (truncated) math-rich selection when it is large;
        * otherwise the start of the document, a marker line and the
          math-rich selection, together never longer than ``max_chars``.
    """
    import re

    if len(text) <= max_chars:
        return text

    print("Large document detected, prioritizing content for formula extraction")
    # Look for mathematical content indicators
    math_hint = re.compile(
        r'[=+\-*/∫∑∏√∂∇∆λμσπ]|equation|formula|theorem|lemma|proof',
        re.IGNORECASE,
    )

    lines = text.split('\n')
    keep = set()  # indexes of lines to keep; a set removes window overlap
    for i, line in enumerate(lines):
        if math_hint.search(line):
            # Include surrounding context
            keep.update(range(max(0, i - context_lines),
                              min(len(lines), i + context_lines + 1)))

    if not keep:
        # No math indicators found, use first part
        return text[:max_chars]

    math_text = '\n'.join(lines[i] for i in sorted(keep))
    marker = '\n\n[Mathematical content sections:]\n'
    # Room left for the document head once marker and math text are counted.
    head_budget = max_chars - len(marker) - len(math_text)

    if len(math_text) > math_only_threshold or head_budget <= 0:
        # The selection alone (nearly) fills the budget.
        return math_text[:max_chars]

    # Combine the beginning of the document with the math sections.
    return text[:head_budget] + marker + math_text
368
-
369
def extract_keywords_with_deepseek(text, headers):
    """Ask the Deepseek chat API for scored keywords of a paper.

    Args:
        text: Paper text that is embedded in the prompt.
        headers: HTTP headers (authorization, content type) for the request.

    Returns:
        The JSON value parsed from the model reply, which the prompt asks to
        be a list of ``{"text", "score"}`` objects. An empty list is returned
        on an API error status, invalid JSON or any other exception.
    """
    try:
        user_prompt = f"""Analyze the following academic paper and extract the key technical terms and concepts.
For each keyword, provide a relevance score between 0 and 1.
Return ONLY a valid JSON array of objects with 'text' and 'score' properties.
Example format: [{{"text": "machine learning", "score": 0.95}}, {{"text": "neural networks", "score": 0.85}}]

Paper text:
{text}"""

        payload = {
            'model': 'deepseek-chat',
            'messages': [
                {
                    'role': 'system',
                    'content': 'You are a helpful assistant that extracts keywords from academic papers. Always respond with valid JSON.'
                },
                {
                    'role': 'user',
                    'content': user_prompt
                }
            ],
            'temperature': 0.3,
            'max_tokens': 4000
        }

        keyword_response = requests.post(
            'https://api.deepseek.com/v1/chat/completions',
            headers=headers,
            json=payload,
            timeout=60
        )

        if not keyword_response.ok:
            print(f"Keywords API error: {keyword_response.text}")
            return []

        response_content = keyword_response.json()['choices'][0]['message']['content']
        try:
            import re
            # The model may wrap the array in prose; grab the outermost [...]
            # and fall back to parsing the whole reply.
            array_match = re.search(r'\[.*\]', response_content, re.DOTALL)
            candidate = array_match.group() if array_match else response_content
            return json.loads(candidate)
        except json.JSONDecodeError:
            print(f"Invalid JSON from keywords API: {response_content}")
            return []

    except Exception as e:
        print(f"Error in keywords extraction: {str(e)}")
        return []
421
-
422
def generate_summary_with_deepseek(text, headers):
    """Ask the Deepseek chat API for a summary of a paper.

    Args:
        text: Paper text that is embedded in the prompt.
        headers: HTTP headers (authorization, content type) for the request.

    Returns:
        The summary text from the first choice of the reply, or the string
        ``"Error generating summary"`` on an API error status or exception.
    """
    try:
        user_prompt = f"""Provide a comprehensive summary of the following academic paper.
Focus on the main contributions, methodology, and key findings.
Keep the response concise and well-structured.

Paper text:
{text}"""

        payload = {
            'model': 'deepseek-chat',
            'messages': [
                {
                    'role': 'system',
                    'content': 'You are a helpful assistant that summarizes academic papers.'
                },
                {
                    'role': 'user',
                    'content': user_prompt
                }
            ],
            'temperature': 0.3,
            'max_tokens': 4000
        }

        summary_response = requests.post(
            'https://api.deepseek.com/v1/chat/completions',
            headers=headers,
            json=payload,
            timeout=60
        )

        if not summary_response.ok:
            print(f"Summary API error: {summary_response.text}")
            return "Error generating summary"

        return summary_response.json()['choices'][0]['message']['content']

    except Exception as e:
        print(f"Error in summary generation: {str(e)}")
        return "Error generating summary"
462
-
463
def extract_formulas_with_deepseek(text, headers):
    """Ask the Deepseek chat API to list the formulas found in a paper.

    Args:
        text: Paper text that is embedded in the prompt.
        headers: HTTP headers (authorization, content type) for the request.

    Returns:
        The JSON value parsed from the model reply, which the prompt asks to
        be a list of objects with ``formula``, ``description``, ``variables``,
        ``variables_chinese``, ``type``, ``context`` and
        ``chinese_description``. An empty list is returned on an API error
        status, invalid JSON or any other exception.
    """
    try:
        system_prompt = '''You are an expert mathematician and AI assistant specialized in extracting mathematical formulas from academic papers.
Your task is to identify and extract ALL mathematical formulas, equations, and mathematical expressions from the given text, try as much as you can.

IMPORTANT INSTRUCTIONS:
1. Extract EVERY mathematical formula, equation, or expression you find
2. Include inline formulas, displayed equations, and mathematical definitions
3. Preserve the original notation as much as possible
4. For each formula, provide context about what it represents
5. Always respond with valid JSON format

You must be thorough and extract ALL formulas, not just the main ones.'''

        user_prompt = f"""Extract ALL mathematical formulas and equations from the following paper text.

For each formula found, provide:
- The formula itself (in LaTeX notation if possible)
- A detailed description explaining what the formula represents and what each variable means
- The context or section where it appears
- Whether it's a definition, theorem, lemma, or general equation
- A Chinese description that explains the formula's purpose

Return a JSON array where each element has these properties:
- "formula": The mathematical expression (use LaTeX notation)
- "description": What the formula represents or calculates
- "variables": Detailed explanation of what each variable/symbol means in the formula
- "variables_chinese": Chinese translation of variable explanations (same structure as variables)
- "type": One of ["definition", "theorem", "lemma", "equation", "inequality", "identity", "other"]
- "context": Brief context about where/how it's used
- "chinese_description": A comprehensive Chinese description of the formula and its purpose

Example format:
[
{{
"formula": "E = mc^2",
"description": "Einstein's mass-energy equivalence relation",
"variables": {{"E": "energy (joules)", "m": "mass (kilograms)", "c": "speed of light in vacuum (≈3×10^8 m/s)"}},
"variables_chinese": {{"E": "能量 (焦耳)", "m": "质量 (千克)", "c": "真空中的光速 (≈3×10^8 m/s)"}},
"type": "equation",
"context": "Fundamental equation in special relativity theory",
"chinese_description": "爱因斯坦质能等价公式,表示质量和能量之间的等价关系"
}},
{{
"formula": "F = ma",
"description": "Newton's second law of motion",
"variables": {{"F": "net force (newtons)", "m": "mass (kilograms)", "a": "acceleration (m/s²)"}},
"variables_chinese": {{"F": "净力 (牛顿)", "m": "质量 (千克)", "a": "加速度 (m/s²)"}},
"type": "equation",
"context": "Classical mechanics fundamental law",
"chinese_description": "牛顿第二定律,描述物体受力与加速度的关系"
}}
]

Paper text:
{text}

IMPORTANT INSTRUCTIONS:
1. Extract EVERY formula, even simple ones like "x + y = z" or "f(x) = ax + b"
2. For each variable or symbol in the formula, explain what it represents
3. Include units of measurement when relevant
4. Provide comprehensive Chinese descriptions that explain the formula's significance
5. Be thorough and detailed in variable explanations"""

        payload = {
            'model': 'deepseek-chat',
            'messages': [
                {
                    'role': 'system',
                    'content': system_prompt
                },
                {
                    'role': 'user',
                    'content': user_prompt
                }
            ],
            'temperature': 0.1,  # low temperature for more consistent extraction
            'max_tokens': 4000
        }

        related_response = requests.post(
            'https://api.deepseek.com/v1/chat/completions',
            headers=headers,
            json=payload,
            timeout=120  # longer timeout than the other calls, for large documents
        )

        if not related_response.ok:
            print(f"Formulas API error: {related_response.text}")
            return []

        response_content = related_response.json()['choices'][0]['message']['content']
        try:
            import re
            # The model may wrap the array in prose; grab the outermost [...]
            # and fall back to parsing the whole reply.
            array_match = re.search(r'\[[\s\S]*\]', response_content)
            if array_match is None:
                return json.loads(response_content)
            formulas = json.loads(array_match.group())
            return formulas
        except json.JSONDecodeError as e:
            print(f"Invalid JSON from formulas API: {response_content}")
            print(f"JSON Error: {str(e)}")
            return []

    except Exception as e:
        print(f"Error in formula extraction: {str(e)}")
        return []