cnhkmcp 2.1.2__py3-none-any.whl → 2.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cnhkmcp-2.1.2.dist-info → cnhkmcp-2.1.3.dist-info}/METADATA +1 -1
- cnhkmcp-2.1.3.dist-info/RECORD +6 -0
- cnhkmcp-2.1.3.dist-info/top_level.txt +1 -0
- cnhkmcp/__init__.py +0 -125
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/README.md +0 -38
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/ace.log +0 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/config.json +0 -6
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/get_knowledgeBase_tool/ace_lib.py +0 -1510
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/get_knowledgeBase_tool/fetch_all_datasets.py +0 -157
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/get_knowledgeBase_tool/fetch_all_documentation.py +0 -132
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/get_knowledgeBase_tool/fetch_all_operators.py +0 -99
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/get_knowledgeBase_tool/helpful_functions.py +0 -180
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/icon.ico +0 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/icon.png +0 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/knowledge/test.txt +0 -1
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/main.py +0 -576
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/process_knowledge_base.py +0 -281
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/rag_engine.py +0 -408
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/requirements.txt +0 -7
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/run.bat +0 -3
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242//321/211/320/266/320/246/321/206/320/274/320/261/321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/231/320/243/321/205/342/225/235/320/220/321/206/320/230/320/241.py +0 -265
- cnhkmcp/untracked/APP/.gitignore +0 -32
- cnhkmcp/untracked/APP/MODULAR_STRUCTURE.md +0 -112
- cnhkmcp/untracked/APP/README.md +0 -309
- cnhkmcp/untracked/APP/Tranformer/Transformer.py +0 -4985
- cnhkmcp/untracked/APP/Tranformer/ace.log +0 -0
- cnhkmcp/untracked/APP/Tranformer/ace_lib.py +0 -1510
- cnhkmcp/untracked/APP/Tranformer/helpful_functions.py +0 -180
- cnhkmcp/untracked/APP/Tranformer/output/Alpha_candidates.json +0 -2421
- cnhkmcp/untracked/APP/Tranformer/output/Alpha_candidates_/321/207/320/264/342/225/221/321/204/342/225/233/320/233.json +0 -654
- cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_error.json +0 -1034
- cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_success.json +0 -444
- cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_/321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/320/237/320/277/321/207/320/253/342/224/244/321/206/320/236/320/265/321/210/342/225/234/342/225/234/321/205/320/225/320/265Machine_lib.json +0 -22
- cnhkmcp/untracked/APP/Tranformer/parsetab.py +0 -60
- cnhkmcp/untracked/APP/Tranformer/template_summary.txt +0 -3182
- cnhkmcp/untracked/APP/Tranformer/transformer_config.json +0 -7
- cnhkmcp/untracked/APP/Tranformer/validator.py +0 -889
- cnhkmcp/untracked/APP/ace.log +0 -69
- cnhkmcp/untracked/APP/ace_lib.py +0 -1510
- cnhkmcp/untracked/APP/blueprints/__init__.py +0 -6
- cnhkmcp/untracked/APP/blueprints/feature_engineering.py +0 -347
- cnhkmcp/untracked/APP/blueprints/idea_house.py +0 -221
- cnhkmcp/untracked/APP/blueprints/inspiration_house.py +0 -432
- cnhkmcp/untracked/APP/blueprints/paper_analysis.py +0 -570
- cnhkmcp/untracked/APP/custom_templates/templates.json +0 -1257
- cnhkmcp/untracked/APP/give_me_idea/BRAIN_Alpha_Template_Expert_SystemPrompt.md +0 -400
- cnhkmcp/untracked/APP/give_me_idea/ace_lib.py +0 -1510
- cnhkmcp/untracked/APP/give_me_idea/alpha_data_specific_template_master.py +0 -252
- cnhkmcp/untracked/APP/give_me_idea/fetch_all_datasets.py +0 -157
- cnhkmcp/untracked/APP/give_me_idea/fetch_all_operators.py +0 -99
- cnhkmcp/untracked/APP/give_me_idea/helpful_functions.py +0 -180
- cnhkmcp/untracked/APP/give_me_idea/what_is_Alpha_template.md +0 -11
- cnhkmcp/untracked/APP/helpful_functions.py +0 -180
- cnhkmcp/untracked/APP/hkSimulator/ace_lib.py +0 -1497
- cnhkmcp/untracked/APP/hkSimulator/autosimulator.py +0 -447
- cnhkmcp/untracked/APP/hkSimulator/helpful_functions.py +0 -180
- cnhkmcp/untracked/APP/mirror_config.txt +0 -20
- cnhkmcp/untracked/APP/operaters.csv +0 -129
- cnhkmcp/untracked/APP/requirements.txt +0 -53
- cnhkmcp/untracked/APP/run_app.bat +0 -28
- cnhkmcp/untracked/APP/run_app.sh +0 -34
- cnhkmcp/untracked/APP/setup_tsinghua.bat +0 -39
- cnhkmcp/untracked/APP/setup_tsinghua.sh +0 -43
- cnhkmcp/untracked/APP/simulator/alpha_submitter.py +0 -404
- cnhkmcp/untracked/APP/simulator/simulator_wqb.py +0 -618
- cnhkmcp/untracked/APP/ssrn-3332513.pdf +6 -109201
- cnhkmcp/untracked/APP/static/brain.js +0 -589
- cnhkmcp/untracked/APP/static/decoder.js +0 -1540
- cnhkmcp/untracked/APP/static/feature_engineering.js +0 -1729
- cnhkmcp/untracked/APP/static/idea_house.js +0 -937
- cnhkmcp/untracked/APP/static/inspiration.js +0 -465
- cnhkmcp/untracked/APP/static/inspiration_house.js +0 -868
- cnhkmcp/untracked/APP/static/paper_analysis.js +0 -390
- cnhkmcp/untracked/APP/static/script.js +0 -3082
- cnhkmcp/untracked/APP/static/simulator.js +0 -597
- cnhkmcp/untracked/APP/static/styles.css +0 -3127
- cnhkmcp/untracked/APP/static/usage_widget.js +0 -508
- cnhkmcp/untracked/APP/templates/alpha_inspector.html +0 -511
- cnhkmcp/untracked/APP/templates/feature_engineering.html +0 -960
- cnhkmcp/untracked/APP/templates/idea_house.html +0 -564
- cnhkmcp/untracked/APP/templates/index.html +0 -932
- cnhkmcp/untracked/APP/templates/inspiration_house.html +0 -861
- cnhkmcp/untracked/APP/templates/paper_analysis.html +0 -91
- cnhkmcp/untracked/APP/templates/simulator.html +0 -343
- cnhkmcp/untracked/APP/templates/transformer_web.html +0 -580
- cnhkmcp/untracked/APP/usage.md +0 -351
- cnhkmcp/untracked/APP//321/207/342/225/235/320/250/321/205/320/230/320/226/321/204/342/225/225/320/220/321/211/320/221/320/243/321/206/320/261/320/265/ace_lib.py +0 -1510
- cnhkmcp/untracked/APP//321/207/342/225/235/320/250/321/205/320/230/320/226/321/204/342/225/225/320/220/321/211/320/221/320/243/321/206/320/261/320/265/brain_alpha_inspector.py +0 -712
- cnhkmcp/untracked/APP//321/207/342/225/235/320/250/321/205/320/230/320/226/321/204/342/225/225/320/220/321/211/320/221/320/243/321/206/320/261/320/265/helpful_functions.py +0 -180
- cnhkmcp/untracked/APP//321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/231/320/243/321/205/342/225/235/320/220/321/206/320/230/320/241.py +0 -2456
- cnhkmcp/untracked/arXiv_API_Tool_Manual.md +0 -490
- cnhkmcp/untracked/arxiv_api.py +0 -229
- cnhkmcp/untracked/forum_functions.py +0 -998
- cnhkmcp/untracked/mcp/321/206/320/246/320/227/321/204/342/225/227/342/225/242/321/210/320/276/342/225/221/321/205/320/255/320/253/321/207/320/231/320/2302_/321/205/320/266/320/222/321/206/320/256/320/254/321/205/320/236/320/257/321/207/320/231/320/230/321/205/320/240/320/277/321/205/320/232/320/270/321/204/342/225/225/320/235/321/204/342/225/221/320/226/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270/321/205/342/226/221/342/226/222/321/210/320/277/320/245/321/210/342/224/220/320/251/321/204/342/225/225/320/272/forum_functions.py +0 -407
- cnhkmcp/untracked/mcp/321/206/320/246/320/227/321/204/342/225/227/342/225/242/321/210/320/276/342/225/221/321/205/320/255/320/253/321/207/320/231/320/2302_/321/205/320/266/320/222/321/206/320/256/320/254/321/205/320/236/320/257/321/207/320/231/320/230/321/205/320/240/320/277/321/205/320/232/320/270/321/204/342/225/225/320/235/321/204/342/225/221/320/226/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270/321/205/342/226/221/342/226/222/321/210/320/277/320/245/321/210/342/224/220/320/251/321/204/342/225/225/320/272/platform_functions.py +0 -2415
- cnhkmcp/untracked/mcp/321/206/320/246/320/227/321/204/342/225/227/342/225/242/321/210/320/276/342/225/221/321/205/320/255/320/253/321/207/320/231/320/2302_/321/205/320/266/320/222/321/206/320/256/320/254/321/205/320/236/320/257/321/207/320/231/320/230/321/205/320/240/320/277/321/205/320/232/320/270/321/204/342/225/225/320/235/321/204/342/225/221/320/226/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270/321/205/342/226/221/342/226/222/321/210/320/277/320/245/321/210/342/224/220/320/251/321/204/342/225/225/320/272/user_config.json +0 -31
- cnhkmcp/untracked/mcp/321/206/320/246/320/227/321/204/342/225/227/342/225/242/321/210/320/276/342/225/221/321/205/320/255/320/253/321/207/320/231/320/2302_/321/205/320/266/320/222/321/206/320/256/320/254/321/205/320/236/320/257/321/207/320/231/320/230/321/205/320/240/320/277/321/205/320/232/320/270/321/204/342/225/225/320/235/321/204/342/225/221/320/226/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270/321/205/342/226/221/342/226/222/321/210/320/277/320/245/321/210/342/224/220/320/251/321/204/342/225/225/320/272//321/210/320/276/320/271AI/321/210/320/277/342/225/227/321/210/342/224/220/320/251/321/204/342/225/225/320/272/321/206/320/246/320/227/321/206/320/261/320/263/321/206/320/255/320/265/321/205/320/275/320/266/321/204/342/225/235/320/252/321/204/342/225/225/320/233/321/210/342/225/234/342/225/234/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270.md +0 -101
- cnhkmcp/untracked/mcp/321/206/320/246/320/227/321/204/342/225/227/342/225/242/321/210/320/276/342/225/221/321/205/320/255/320/253/321/207/320/231/320/2302_/321/205/320/266/320/222/321/206/320/256/320/254/321/205/320/236/320/257/321/207/320/231/320/230/321/205/320/240/320/277/321/205/320/232/320/270/321/204/342/225/225/320/235/321/204/342/225/221/320/226/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270/321/205/342/226/221/342/226/222/321/210/320/277/320/245/321/210/342/224/220/320/251/321/204/342/225/225/320/272//321/211/320/225/320/235/321/207/342/225/234/320/276/321/205/320/231/320/235/321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/230/320/241_/321/205/320/276/320/231/321/210/320/263/320/225/321/205/342/224/220/320/225/321/210/320/266/320/221/321/204/342/225/233/320/255/321/210/342/225/241/320/246/321/205/320/234/320/225.py +0 -190
- cnhkmcp/untracked/platform_functions.py +0 -2886
- cnhkmcp/untracked/sample_mcp_config.json +0 -11
- cnhkmcp/untracked/user_config.json +0 -31
- cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/320/237/320/222/321/210/320/220/320/223/321/206/320/246/320/227/321/206/320/261/320/263_BRAIN_Alpha_Test_Requirements_and_Tips.md +0 -202
- cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/342/225/226/320/265/321/204/342/225/234/320/254/321/206/342/225/241/320/221_Alpha_explaination_workflow.md +0 -56
- cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/342/225/226/320/265/321/204/342/225/234/320/254/321/206/342/225/241/320/221_BRAIN_6_Tips_Datafield_Exploration_Guide.md +0 -194
- cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/342/225/226/320/265/321/204/342/225/234/320/254/321/206/342/225/241/320/221_BRAIN_Alpha_Improvement_Workflow.md +0 -101
- cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/342/225/226/320/265/321/204/342/225/234/320/254/321/206/342/225/241/320/221_Dataset_Exploration_Expert_Manual.md +0 -436
- cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/342/225/226/320/265/321/204/342/225/234/320/254/321/206/342/225/241/320/221_daily_report_workflow.md +0 -128
- cnhkmcp/untracked//321/211/320/225/320/235/321/207/342/225/234/320/276/321/205/320/231/320/235/321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/230/320/241_/321/205/320/276/320/231/321/210/320/263/320/225/321/205/342/224/220/320/225/321/210/320/266/320/221/321/204/342/225/233/320/255/321/210/342/225/241/320/246/321/205/320/234/320/225.py +0 -190
- cnhkmcp-2.1.2.dist-info/RECORD +0 -111
- cnhkmcp-2.1.2.dist-info/top_level.txt +0 -1
- {cnhkmcp-2.1.2.dist-info → cnhkmcp-2.1.3.dist-info}/WHEEL +0 -0
- {cnhkmcp-2.1.2.dist-info → cnhkmcp-2.1.3.dist-info}/entry_points.txt +0 -0
- {cnhkmcp-2.1.2.dist-info → cnhkmcp-2.1.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,570 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Paper Analysis Blueprint - Flask Blueprint for analyzing research papers using Deepseek AI
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
from flask import Blueprint, render_template, request, jsonify
|
|
6
|
-
import requests
|
|
7
|
-
import json
|
|
8
|
-
import os
|
|
9
|
-
import tempfile
|
|
10
|
-
from werkzeug.utils import secure_filename
|
|
11
|
-
|
|
12
|
-
# Create blueprint
|
|
13
|
-
paper_analysis_bp = Blueprint('paper_analysis', __name__, url_prefix='/paper-analysis')
|
|
14
|
-
|
|
15
|
-
@paper_analysis_bp.route('/')
def paper_analysis():
    """Serve the paper analysis landing page.

    Returns:
        The rendered ``paper_analysis.html`` template.
    """
    page = render_template('paper_analysis.html')
    return page
|
|
19
|
-
|
|
20
|
-
@paper_analysis_bp.route('/api/test-deepseek', methods=['POST'])
def test_deepseek():
    """Check that the caller's Deepseek API key can reach the chat endpoint.

    The key is read from the ``X-API-Key`` request header and used for a
    minimal chat-completion request with a 10 second timeout.

    Returns:
        A Flask JSON response:
        * 401 when the ``X-API-Key`` header is missing or empty,
        * the upstream status code when Deepseek answers with an error,
        * 500 on a ``requests`` failure or any other unexpected error,
        * a success payload (default status) containing Deepseek's reply.
    """
    try:
        api_key = request.headers.get('X-API-Key')
        if not api_key:
            return jsonify({'error': 'API key is required'}), 401

        auth_headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json'
        }
        # Smallest useful prompt: only the success of the call matters here.
        probe_payload = {
            'model': 'deepseek-chat',
            'messages': [
                {'role': 'user', 'content': 'Say hello'}
            ],
            'max_tokens': 10
        }

        test_response = requests.post(
            'https://api.deepseek.com/v1/chat/completions',
            headers=auth_headers,
            json=probe_payload,
            timeout=10
        )

        # Guard clause: relay upstream failures with their original status.
        if not test_response.ok:
            failure = {
                'success': False,
                'error': f'API Error: {test_response.status_code}',
                'details': test_response.text
            }
            return jsonify(failure), test_response.status_code

        return jsonify({
            'success': True,
            'message': 'Deepseek API connection successful',
            'response': test_response.json()
        })

    except requests.exceptions.RequestException as e:
        return jsonify({
            'success': False,
            'error': 'Connection error',
            'details': str(e)
        }), 500
    except Exception as e:
        return jsonify({
            'success': False,
            'error': 'Unexpected error',
            'details': str(e)
        }), 500
|
|
72
|
-
|
|
73
|
-
@paper_analysis_bp.route('/api/analyze-paper', methods=['POST'])
def analyze_paper():
    """Analyze an uploaded paper with the Deepseek API.

    Request inputs:
        * ``X-API-Key`` header: Deepseek API key (required).
        * ``file`` multipart field: the document to analyze (1 byte to 50MB).
        * ``extract_keywords`` / ``generate_summary`` / ``find_related`` form
          fields: an analysis runs only when its field equals ``'true'``.

    Returns:
        A Flask JSON response. On success it carries the keys ``keywords``,
        ``summary`` and ``related_works``; analyses that were not requested
        keep their empty defaults. ``related_works`` is filled by
        ``extract_formulas_with_deepseek``. Validation problems yield 400,
        a missing API key 401, and any unexpected exception 500.
    """
    try:
        # Get API key from header
        api_key = request.headers.get('X-API-Key')
        if not api_key:
            return jsonify({'error': 'API key is required'}), 401

        # Get analysis options
        extract_keywords = request.form.get('extract_keywords') == 'true'
        generate_summary = request.form.get('generate_summary') == 'true'
        find_related = request.form.get('find_related') == 'true'

        # Get uploaded file
        if 'file' not in request.files:
            return jsonify({'error': 'No file uploaded'}), 400

        upload = request.files['file']
        if upload.filename == '':
            return jsonify({'error': 'No file selected'}), 400

        # Measure the upload by seeking to its end, then rewind for saving.
        upload.seek(0, 2)
        file_size = upload.tell()
        upload.seek(0)

        if file_size > 50 * 1024 * 1024:  # 50MB limit
            return jsonify({'error': 'File too large. Maximum size is 50MB'}), 400

        if file_size == 0:
            return jsonify({'error': 'File is empty'}), 400

        filename = secure_filename(upload.filename)
        print(f"Processing file: {filename} (size: {file_size} bytes)")

        # Reserve a temp path that keeps the original extension. The handle
        # is closed right away; the upload is written by path inside the
        # try block below so the file is removed even when saving fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as temp_file:
            file_path = temp_file.name

        try:
            upload.save(file_path)

            # Initialize results dictionary
            results = {
                'keywords': [],
                'summary': '',
                'related_works': []
            }

            # Extract text from file
            text = extract_text_from_file(file_path, filename)

            if not text or not text.strip():
                return jsonify({'error': 'Could not extract text from the file. The file might be empty or in an unsupported format.'}), 400

            # Clean up text
            text = text.strip()
            print(f"Final text length before truncation: {len(text)}")

            # Check if we have enough text
            if len(text) < 100:
                return jsonify({
                    'error': 'Extracted text is too short. This might be a scanned PDF without OCR text. Please ensure your PDF contains selectable text, not just images.'
                }), 400

            # Handle large documents
            text = process_large_document(text)

            # Call Deepseek API for each requested analysis
            headers = {
                'Authorization': f'Bearer {api_key}',
                'Content-Type': 'application/json'
            }

            if extract_keywords:
                results['keywords'] = extract_keywords_with_deepseek(text, headers)

            if generate_summary:
                results['summary'] = generate_summary_with_deepseek(text, headers)

            if find_related:
                results['related_works'] = extract_formulas_with_deepseek(text, headers)

            return jsonify(results)

        finally:
            # Best-effort cleanup: a failed delete must not mask the response.
            try:
                os.unlink(file_path)
            except OSError as e:
                print(f"Error deleting temporary file: {str(e)}")

    except Exception as e:
        print(f"Analyze paper error: {str(e)}")
        return jsonify({'error': str(e)}), 500
|
|
168
|
-
|
|
169
|
-
def extract_text_from_file(file_path, filename):
    """Pick a text extractor from the extension of ``filename`` and run it.

    Args:
        file_path: Path of the file on disk to read.
        filename: Name whose extension (case-insensitive) selects the
            extractor; unknown extensions are read as plain text.

    Returns:
        Whatever the selected extractor returns.

    Raises:
        Exception: ``"Error reading file: ..."`` wrapping any extractor error.
    """
    extension = os.path.splitext(filename)[1].lower()

    try:
        if extension == '.pdf':
            return extract_pdf_text(file_path)
        if extension in ['.docx', '.doc']:
            return extract_word_text(file_path, extension)
        if extension == '.rtf':
            return extract_rtf_text(file_path)
        if extension in ['.tex', '.latex']:
            return extract_latex_text(file_path)
        if extension in ['.md', '.markdown']:
            return extract_markdown_text(file_path)
        # Anything else is treated as plain text.
        return extract_plain_text(file_path)

    except Exception as e:
        print(f"File processing error: {str(e)}")
        raise Exception(f"Error reading file: {str(e)}")
|
|
193
|
-
|
|
194
|
-
def extract_pdf_text(file_path):
    """Extract text from a PDF, trying several optional libraries.

    Order of attempts:
        1. PyPDF2. Pages that fail individually are logged and skipped.
        2. pdfplumber, only when PyPDF2 is not installed.
        3. PyMuPDF (``fitz``), only when PyPDF2 is installed but failed.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The text of all pages, each extracted page followed by a newline.

    Raises:
        Exception: when no usable PDF library is installed, or when the
            library that was tried and the PyMuPDF fallback both fail.
            The underlying error is chained as ``__cause__``.
    """
    try:
        from PyPDF2 import PdfReader
        reader = PdfReader(file_path)
        num_pages = len(reader.pages)
        print(f"PDF has {num_pages} pages")

        # Collect pages in a list and join once instead of repeated `+=`.
        page_chunks = []
        for i, page in enumerate(reader.pages):
            try:
                page_text = page.extract_text()
                if page_text:
                    page_chunks.append(page_text + '\n')
                print(f"Extracted page {i+1}/{num_pages}")
            except Exception as page_error:
                # One unreadable page should not discard the whole document.
                print(f"Error extracting page {i+1}: {str(page_error)}")
                continue

        text = ''.join(page_chunks)
        print(f"Total extracted text length: {len(text)}")
        return text

    except ImportError:
        # PyPDF2 is not installed: try the alternative PDF library.
        try:
            import pdfplumber
        except ImportError as import_error:
            raise Exception('PDF processing is not available. Please install PyPDF2 or pdfplumber.') from import_error

        page_chunks = []
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    page_chunks.append(page_text + '\n')
        return ''.join(page_chunks)

    except Exception as pdf_error:
        print(f"PDF extraction error: {str(pdf_error)}")
        # PyPDF2 is installed but failed on this file: fall back to PyMuPDF.
        try:
            import fitz  # PyMuPDF
        except ImportError as import_error:
            raise Exception(f'Could not extract text from PDF: {str(pdf_error)}. Try installing PyMuPDF.') from import_error

        try:
            pdf_document = fitz.open(file_path)
            try:
                return ''.join(
                    pdf_document[page_num].get_text() + '\n'
                    for page_num in range(pdf_document.page_count)
                )
            finally:
                # Close the document even when a page fails to render.
                pdf_document.close()
        except Exception as mupdf_error:
            # Report both failures; the original message prefix is kept.
            raise Exception(
                f'PDF extraction failed: {str(pdf_error)} '
                f'(PyMuPDF fallback also failed: {str(mupdf_error)})'
            ) from mupdf_error
|
|
245
|
-
|
|
246
|
-
def extract_word_text(file_path, file_ext):
    """Extract text from a Word document.

    Args:
        file_path: Path of the document.
        file_ext: Lower-case extension. ``'.docx'`` is read with
            python-docx; any other value is handed to docx2txt.

    Returns:
        For ``.docx``: the paragraph texts joined by newlines.
        Otherwise: whatever ``docx2txt.process`` returns.

    Raises:
        Exception: an install hint when the required library is missing
            (raised as-is for both libraries), or
            ``"Error reading Word document: ..."`` when parsing fails.
            The underlying error is chained as ``__cause__``.
    """
    if file_ext == '.docx':
        try:
            from docx import Document
        except ImportError as import_error:
            raise Exception('Word document support requires python-docx. Please install it with: pip install python-docx') from import_error

        try:
            doc = Document(file_path)
            return '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        except Exception as docx_error:
            raise Exception(f'Error reading Word document: {str(docx_error)}') from docx_error

    # .doc files
    # NOTE(review): docx2txt appears to target .docx (zip-based) files;
    # legacy binary .doc files may not be readable - confirm.
    try:
        import docx2txt
    except ImportError as import_error:
        raise Exception('DOC file support requires docx2txt. Please install it with: pip install docx2txt') from import_error

    try:
        return docx2txt.process(file_path)
    except Exception as doc_error:
        raise Exception(f'Error reading Word document: {str(doc_error)}') from doc_error
|
|
264
|
-
|
|
265
|
-
def extract_rtf_text(file_path):
    """Extract plain text from an RTF file using the striprtf package.

    Args:
        file_path: Path of the RTF file.

    Returns:
        The text produced by ``rtf_to_text``.

    Raises:
        Exception: an install hint when striprtf is missing, or
            ``"Error reading RTF file: ..."`` when reading/conversion fails.
            The underlying error is chained as ``__cause__``.
    """
    try:
        # The converter lives in the ``striprtf.striprtf`` submodule; a bare
        # ``import striprtf`` does not load that submodule.
        from striprtf.striprtf import rtf_to_text
    except ImportError as import_error:
        raise Exception('RTF support requires striprtf. Please install it with: pip install striprtf') from import_error

    try:
        # errors='replace': stray 8-bit bytes must not abort the extraction.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            rtf_content = f.read()
        return rtf_to_text(rtf_content)
    except Exception as rtf_error:
        raise Exception(f'Error reading RTF file: {str(rtf_error)}') from rtf_error
|
|
276
|
-
|
|
277
|
-
def extract_latex_text(file_path):
    """Read a LaTeX file and strip the most common markup.

    Processing steps, in order:
        1. Remove comments (an unescaped ``%`` to end of line).
        2. Remove ``\\begin{...}`` / ``\\end{...}`` markers.
        3. Replace ``\\command{content}`` with ``content``.
        4. Remove remaining bare ``\\command`` tokens.
        5. Turn the escaped ``\\%`` back into a literal ``%``.

    This is a regex-based cleanup, not a LaTeX parser; nested braces are
    only partially handled.

    Args:
        file_path: Path of the UTF-8 encoded ``.tex`` file.

    Returns:
        The cleaned text.

    Raises:
        Exception: ``"Error reading LaTeX file: ..."`` on any failure, with
            the underlying error chained as ``__cause__``.
    """
    try:
        import re

        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Remove comments. The lookbehind keeps an escaped percent sign
        # ("50\%") from being mistaken for the start of a comment.
        text = re.sub(r'(?<!\\)%.*$', '', text, flags=re.MULTILINE)
        # Remove common LaTeX commands but keep content
        text = re.sub(r'\\(begin|end)\{[^}]+\}', '', text)
        text = re.sub(r'\\[a-zA-Z]+\*?\{([^}]+)\}', r'\1', text)
        text = re.sub(r'\\[a-zA-Z]+\*?', '', text)
        # Unescape the literal percent signs preserved above.
        text = text.replace('\\%', '%')
        return text
    except Exception as tex_error:
        raise Exception(f'Error reading LaTeX file: {str(tex_error)}') from tex_error
|
|
294
|
-
|
|
295
|
-
def extract_markdown_text(file_path):
    """Read a Markdown file, dropping images and flattening links.

    Args:
        file_path: Path of the UTF-8 encoded Markdown file.

    Returns:
        The file content with ``![alt](url)`` removed and ``[label](url)``
        replaced by ``label``. Other Markdown syntax is left untouched.

    Raises:
        Exception: ``"Error reading Markdown file: ..."`` on any failure.
    """
    try:
        import re

        with open(file_path, 'r', encoding='utf-8') as handle:
            raw = handle.read()

        # Images go first so the link pattern does not see their "[alt](url)".
        without_images = re.sub(r'!\[[^\]]*\]\([^)]+\)', '', raw)
        return re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', without_images)
    except Exception as md_error:
        raise Exception(f'Error reading Markdown file: {str(md_error)}')
|
|
309
|
-
|
|
310
|
-
def extract_plain_text(file_path):
    """Read a text file, guessing its encoding.

    UTF-16 is only attempted when the file starts with a UTF-16 byte order
    mark: without a BOM the ``utf-16`` codec accepts most even-length byte
    strings, so legacy CJK files would decode to garbage before their real
    encoding was tried. The remaining candidates are tried in order:
    UTF-8 (a BOM, if present, is stripped), GBK, GB2312, Big5 and finally
    Latin-1, which accepts any byte sequence.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file content.

    Raises:
        OSError: if the file cannot be opened.
    """
    # Sniff the first two bytes for a UTF-16 byte order mark.
    with open(file_path, 'rb') as f:
        leading_bytes = f.read(2)

    encodings = ['utf-8-sig', 'gbk', 'gb2312', 'big5', 'latin-1']
    if leading_bytes in (b'\xff\xfe', b'\xfe\xff'):
        encodings.insert(0, 'utf-16')

    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                text = f.read()
        except UnicodeDecodeError:
            continue
        print(f"Successfully read file with {encoding} encoding")
        return text

    # Safety net only: latin-1 above decodes any input, so this is not
    # expected to run.
    with open(file_path, 'rb') as f:
        return f.read().decode('utf-8', errors='ignore')
|
|
337
|
-
|
|
338
|
-
def process_large_document(text):
    """Shrink an oversized document, favoring lines that look mathematical.

    Texts of at most 98000 characters are returned unchanged. For longer
    texts, every line matching a math heuristic (operators, some Greek and
    math symbols, or words such as "equation" or "theorem") is collected
    together with five lines of context on each side; each source line is
    kept at most once and document order is preserved.

    * No matching line: the first 98000 characters are returned.
    * More than 50000 characters of math context: only that context is
      returned, cut to 98000 characters.
    * Otherwise: the beginning of the document, a marker line and the math
      context are combined so that the total stays within 98000 characters.

    Args:
        text: Full document text.

    Returns:
        A string of at most 98000 characters (or ``text`` itself when it is
        already short enough).
    """
    max_chars = 98000
    if len(text) <= max_chars:
        return text

    print("Large document detected, prioritizing content for formula extraction")
    import re

    # Look for mathematical content indicators
    math_pattern = re.compile(
        r'[=+\-*/∫∑∏√∂∇∆λμσπ]|equation|formula|theorem|lemma|proof',
        re.IGNORECASE,
    )
    lines = text.split('\n')

    # Collect line indices in a set so overlapping context windows around
    # neighbouring matches do not duplicate lines.
    selected = set()
    for i, line in enumerate(lines):
        if math_pattern.search(line):
            selected.update(range(max(0, i - 5), min(len(lines), i + 6)))

    if not selected:
        # No math indicators found, use first part
        return text[:max_chars]

    math_text = '\n'.join(lines[i] for i in sorted(selected))
    if len(math_text) > 50000:  # Still too long
        return math_text[:max_chars]

    # Combine math sections with beginning of document; the marker's own
    # length is budgeted so the result never exceeds max_chars.
    marker = '\n\n[Mathematical content sections:]\n'
    remaining_space = max(0, max_chars - len(math_text) - len(marker))
    return text[:remaining_space] + marker + math_text
|
|
368
|
-
|
|
369
|
-
def extract_keywords_with_deepseek(text, headers):
    """Ask the Deepseek chat API for scored keywords of a paper.

    Args:
        text: Paper text embedded into the prompt.
        headers: HTTP headers for the request (the caller supplies the
            authorization header).

    Returns:
        The JSON value parsed from the model's reply (the prompt asks for a
        list of ``{"text", "score"}`` objects), or ``[]`` when the request
        fails, the reply is not valid JSON, or any other error occurs.
    """
    try:
        user_prompt = f"""Analyze the following academic paper and extract the key technical terms and concepts.
For each keyword, provide a relevance score between 0 and 1.
Return ONLY a valid JSON array of objects with 'text' and 'score' properties.
Example format: [{{"text": "machine learning", "score": 0.95}}, {{"text": "neural networks", "score": 0.85}}]

Paper text:
{text}"""
        payload = {
            'model': 'deepseek-chat',
            'messages': [
                {
                    'role': 'system',
                    'content': 'You are a helpful assistant that extracts keywords from academic papers. Always respond with valid JSON.'
                },
                {
                    'role': 'user',
                    'content': user_prompt
                }
            ],
            'temperature': 0.3,
            'max_tokens': 4000
        }

        keyword_response = requests.post(
            'https://api.deepseek.com/v1/chat/completions',
            headers=headers,
            json=payload,
            timeout=60
        )

        if not keyword_response.ok:
            print(f"Keywords API error: {keyword_response.text}")
            return []

        response_content = keyword_response.json()['choices'][0]['message']['content']
        try:
            import re
            # The model may wrap the array in extra prose: take the span from
            # the first '[' to the last ']' if there is one, else the reply.
            json_match = re.search(r'\[.*\]', response_content, re.DOTALL)
            candidate = json_match.group() if json_match else response_content
            return json.loads(candidate)
        except json.JSONDecodeError:
            print(f"Invalid JSON from keywords API: {response_content}")
            return []

    except Exception as e:
        print(f"Error in keywords extraction: {str(e)}")
        return []
|
|
421
|
-
|
|
422
|
-
def generate_summary_with_deepseek(text, headers):
    """Ask the Deepseek chat API for a summary of a paper.

    Args:
        text: Paper text embedded into the prompt.
        headers: HTTP headers for the request (the caller supplies the
            authorization header).

    Returns:
        The summary text from the first choice of the reply, or the string
        ``"Error generating summary"`` when the request fails or any error
        occurs.
    """
    fallback = "Error generating summary"
    try:
        user_prompt = f"""Provide a comprehensive summary of the following academic paper.
Focus on the main contributions, methodology, and key findings.
Keep the response concise and well-structured.

Paper text:
{text}"""
        payload = {
            'model': 'deepseek-chat',
            'messages': [
                {
                    'role': 'system',
                    'content': 'You are a helpful assistant that summarizes academic papers.'
                },
                {
                    'role': 'user',
                    'content': user_prompt
                }
            ],
            'temperature': 0.3,
            'max_tokens': 4000
        }

        summary_response = requests.post(
            'https://api.deepseek.com/v1/chat/completions',
            headers=headers,
            json=payload,
            timeout=60
        )

        if not summary_response.ok:
            print(f"Summary API error: {summary_response.text}")
            return fallback

        return summary_response.json()['choices'][0]['message']['content']

    except Exception as e:
        print(f"Error in summary generation: {str(e)}")
        return fallback
|
|
462
|
-
|
|
463
|
-
def _build_formula_messages(text):
    """Return the system and user chat messages that request every formula in *text*."""
    # The continuation lines of both prompts are flush-left on purpose: any
    # leading whitespace here would become part of the prompt text.
    return [
        {
            'role': 'system',
            'content': '''You are an expert mathematician and AI assistant specialized in extracting mathematical formulas from academic papers.
Your task is to identify and extract ALL mathematical formulas, equations, and mathematical expressions from the given text, try as much as you can.

IMPORTANT INSTRUCTIONS:
1. Extract EVERY mathematical formula, equation, or expression you find
2. Include inline formulas, displayed equations, and mathematical definitions
3. Preserve the original notation as much as possible
4. For each formula, provide context about what it represents
5. Always respond with valid JSON format

You must be thorough and extract ALL formulas, not just the main ones.'''
        },
        {
            'role': 'user',
            'content': f"""Extract ALL mathematical formulas and equations from the following paper text.

For each formula found, provide:
- The formula itself (in LaTeX notation if possible)
- A detailed description explaining what the formula represents and what each variable means
- The context or section where it appears
- Whether it's a definition, theorem, lemma, or general equation
- A Chinese description that explains the formula's purpose

Return a JSON array where each element has these properties:
- "formula": The mathematical expression (use LaTeX notation)
- "description": What the formula represents or calculates
- "variables": Detailed explanation of what each variable/symbol means in the formula
- "variables_chinese": Chinese translation of variable explanations (same structure as variables)
- "type": One of ["definition", "theorem", "lemma", "equation", "inequality", "identity", "other"]
- "context": Brief context about where/how it's used
- "chinese_description": A comprehensive Chinese description of the formula and its purpose

Example format:
[
{{
"formula": "E = mc^2",
"description": "Einstein's mass-energy equivalence relation",
"variables": {{"E": "energy (joules)", "m": "mass (kilograms)", "c": "speed of light in vacuum (≈3×10^8 m/s)"}},
"variables_chinese": {{"E": "能量 (焦耳)", "m": "质量 (千克)", "c": "真空中的光速 (≈3×10^8 m/s)"}},
"type": "equation",
"context": "Fundamental equation in special relativity theory",
"chinese_description": "爱因斯坦质能等价公式,表示质量和能量之间的等价关系"
}},
{{
"formula": "F = ma",
"description": "Newton's second law of motion",
"variables": {{"F": "net force (newtons)", "m": "mass (kilograms)", "a": "acceleration (m/s²)"}},
"variables_chinese": {{"F": "净力 (牛顿)", "m": "质量 (千克)", "a": "加速度 (m/s²)"}},
"type": "equation",
"context": "Classical mechanics fundamental law",
"chinese_description": "牛顿第二定律,描述物体受力与加速度的关系"
}}
]

Paper text:
{text}

IMPORTANT INSTRUCTIONS:
1. Extract EVERY formula, even simple ones like "x + y = z" or "f(x) = ax + b"
2. For each variable or symbol in the formula, explain what it represents
3. Include units of measurement when relevant
4. Provide comprehensive Chinese descriptions that explain the formula's significance
5. Be thorough and detailed in variable explanations"""
        }
    ]


def _parse_formula_payload(response_content):
    """Decode the list of formula objects contained in the model's reply text.

    Every ``[`` in the reply is tried as the start of a JSON value and the
    first one that decodes to a list made only of objects is returned. This
    tolerates prose, code fences or bracketed citations around the array,
    which a single greedy first-``[``-to-last-``]`` match does not.

    Raises:
        json.JSONDecodeError: when no such array exists and the reply as a
            whole is not valid JSON either.
    """
    decoder = json.JSONDecoder()

    start = response_content.find('[')
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(response_content, start)
        except json.JSONDecodeError:
            candidate = None
        # Reject things like a "[1]" citation: only a list of objects (or an
        # empty list) is a plausible formulas payload.
        if isinstance(candidate, list) and all(isinstance(item, dict) for item in candidate):
            return candidate
        start = response_content.find('[', start + 1)

    # No embedded array of objects: fall back to parsing the whole reply.
    parsed = json.loads(response_content)
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict):
        # A lone formula object is wrapped so the result is always a list.
        return [parsed]
    return []


def extract_formulas_with_deepseek(
    text,
    headers,
    *,
    api_url='https://api.deepseek.com/v1/chat/completions',
    model='deepseek-chat',
    temperature=0.1,
    max_tokens=4000,
    timeout=120,
):
    """Extract formulas from paper text using the Deepseek API.

    Args:
        text: Paper text; interpolated verbatim into the user prompt.
        headers: HTTP headers passed unchanged to ``requests.post``.
        api_url: Chat-completions endpoint that receives the POST.
        model: Value sent as the ``model`` field of the request body.
        temperature: Value sent as the ``temperature`` field; the low default
            favours consistent extraction.
        max_tokens: Value sent as the ``max_tokens`` field.
        timeout: Forwarded to ``requests.post`` as its ``timeout`` argument;
            the default is generous to allow for large documents.

    Returns:
        A list decoded from the JSON in the first choice of the API reply
        (per the prompt, one object per formula). An empty list is returned
        when the request raises, the response status is not OK, or the reply
        contains no decodable JSON. The function never raises.
    """
    try:
        related_response = requests.post(
            api_url,
            headers=headers,
            json={
                'model': model,
                'messages': _build_formula_messages(text),
                'temperature': temperature,
                'max_tokens': max_tokens
            },
            timeout=timeout
        )

        if not related_response.ok:
            print(f"Formulas API error: {related_response.text}")
            return []

        response_content = related_response.json()['choices'][0]['message']['content']
        try:
            return _parse_formula_payload(response_content)
        except json.JSONDecodeError as e:
            print(f"Invalid JSON from formulas API: {response_content}")
            print(f"JSON Error: {str(e)}")
            return []

    except Exception as e:
        # Deliberate best-effort boundary: network errors, malformed payloads
        # and prompt-formatting failures all degrade to an empty result.
        print(f"Error in formula extraction: {str(e)}")
        return []
|