siat-3.10.132-py3-none-any.whl → siat-3.10.133-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- siat/__init__.py +0 -0
- siat/allin.py +0 -0
- siat/assets_liquidity.py +0 -0
- siat/beta_adjustment.py +0 -0
- siat/beta_adjustment_china.py +0 -0
- siat/blockchain.py +0 -0
- siat/bond.py +0 -0
- siat/bond_base.py +0 -0
- siat/bond_china.py +0 -0
- siat/bond_zh_sina.py +0 -0
- siat/capm_beta.py +0 -0
- siat/capm_beta2.py +0 -0
- siat/compare_cross.py +0 -0
- siat/copyrights.py +0 -0
- siat/cryptocurrency.py +0 -0
- siat/economy.py +0 -0
- siat/economy2.py +0 -0
- siat/esg.py +0 -0
- siat/event_study.py +0 -0
- siat/exchange_bond_china.pickle +0 -0
- siat/fama_french.py +0 -0
- siat/fin_stmt2_yahoo.py +0 -0
- siat/financial_base.py +0 -0
- siat/financial_statements.py +0 -0
- siat/financials.py +0 -0
- siat/financials2.py +0 -0
- siat/financials_china.py +0 -0
- siat/financials_china2.py +0 -0
- siat/fund.py +0 -0
- siat/fund_china.pickle +0 -0
- siat/fund_china.py +0 -0
- siat/future_china.py +0 -0
- siat/google_authenticator.py +0 -0
- siat/grafix.py +0 -0
- siat/holding_risk.py +0 -0
- siat/luchy_draw.py +0 -0
- siat/market_china.py +0 -0
- siat/markowitz.py +0 -0
- siat/markowitz2.py +0 -0
- siat/markowitz2_20250704.py +0 -0
- siat/markowitz2_20250705.py +0 -0
- siat/markowitz_simple.py +0 -0
- siat/ml_cases.py +0 -0
- siat/ml_cases_example.py +0 -0
- siat/option_china.py +0 -0
- siat/option_pricing.py +0 -0
- siat/other_indexes.py +0 -0
- siat/risk_adjusted_return.py +0 -0
- siat/risk_adjusted_return2.py +0 -0
- siat/risk_evaluation.py +0 -0
- siat/risk_free_rate.py +0 -0
- siat/sector_china.py +0 -0
- siat/security_price2.py +0 -0
- siat/security_prices.py +40 -2
- siat/security_trend.py +0 -0
- siat/security_trend2.py +0 -0
- siat/stock.py +0 -0
- siat/stock_advice_linear.py +0 -0
- siat/stock_base.py +0 -0
- siat/stock_china.py +0 -0
- siat/stock_info.pickle +0 -0
- siat/stock_prices_kneighbors.py +0 -0
- siat/stock_prices_linear.py +0 -0
- siat/stock_profile.py +0 -0
- siat/stock_technical.py +0 -0
- siat/stooq.py +0 -0
- siat/transaction.py +0 -0
- siat/translate.py +0 -0
- siat/valuation.py +0 -0
- siat/valuation_china.py +0 -0
- siat/var_model_validation.py +0 -0
- siat/yf_name.py +0 -0
- {siat-3.10.132.dist-info/licenses → siat-3.10.133.dist-info}/LICENSE +0 -0
- {siat-3.10.132.dist-info → siat-3.10.133.dist-info}/METADATA +232 -235
- siat-3.10.133.dist-info/RECORD +78 -0
- {siat-3.10.132.dist-info → siat-3.10.133.dist-info}/WHEEL +1 -1
- {siat-3.10.132.dist-info → siat-3.10.133.dist-info}/top_level.txt +0 -1
- build/lib/build/lib/siat/__init__.py +0 -75
- build/lib/build/lib/siat/allin.py +0 -137
- build/lib/build/lib/siat/assets_liquidity.py +0 -915
- build/lib/build/lib/siat/beta_adjustment.py +0 -1058
- build/lib/build/lib/siat/beta_adjustment_china.py +0 -548
- build/lib/build/lib/siat/blockchain.py +0 -143
- build/lib/build/lib/siat/bond.py +0 -2900
- build/lib/build/lib/siat/bond_base.py +0 -992
- build/lib/build/lib/siat/bond_china.py +0 -100
- build/lib/build/lib/siat/bond_zh_sina.py +0 -143
- build/lib/build/lib/siat/capm_beta.py +0 -783
- build/lib/build/lib/siat/capm_beta2.py +0 -887
- build/lib/build/lib/siat/common.py +0 -5360
- build/lib/build/lib/siat/compare_cross.py +0 -642
- build/lib/build/lib/siat/copyrights.py +0 -18
- build/lib/build/lib/siat/cryptocurrency.py +0 -667
- build/lib/build/lib/siat/economy.py +0 -1471
- build/lib/build/lib/siat/economy2.py +0 -1853
- build/lib/build/lib/siat/esg.py +0 -536
- build/lib/build/lib/siat/event_study.py +0 -815
- build/lib/build/lib/siat/fama_french.py +0 -1521
- build/lib/build/lib/siat/fin_stmt2_yahoo.py +0 -982
- build/lib/build/lib/siat/financial_base.py +0 -1160
- build/lib/build/lib/siat/financial_statements.py +0 -598
- build/lib/build/lib/siat/financials.py +0 -2339
- build/lib/build/lib/siat/financials2.py +0 -1278
- build/lib/build/lib/siat/financials_china.py +0 -4433
- build/lib/build/lib/siat/financials_china2.py +0 -2212
- build/lib/build/lib/siat/fund.py +0 -629
- build/lib/build/lib/siat/fund_china.py +0 -3307
- build/lib/build/lib/siat/future_china.py +0 -551
- build/lib/build/lib/siat/google_authenticator.py +0 -47
- build/lib/build/lib/siat/grafix.py +0 -3636
- build/lib/build/lib/siat/holding_risk.py +0 -867
- build/lib/build/lib/siat/luchy_draw.py +0 -638
- build/lib/build/lib/siat/market_china.py +0 -1168
- build/lib/build/lib/siat/markowitz.py +0 -2363
- build/lib/build/lib/siat/markowitz2.py +0 -3150
- build/lib/build/lib/siat/markowitz2_20250704.py +0 -2969
- build/lib/build/lib/siat/markowitz2_20250705.py +0 -3158
- build/lib/build/lib/siat/markowitz_simple.py +0 -373
- build/lib/build/lib/siat/ml_cases.py +0 -2291
- build/lib/build/lib/siat/ml_cases_example.py +0 -60
- build/lib/build/lib/siat/option_china.py +0 -3069
- build/lib/build/lib/siat/option_pricing.py +0 -1925
- build/lib/build/lib/siat/other_indexes.py +0 -409
- build/lib/build/lib/siat/risk_adjusted_return.py +0 -1576
- build/lib/build/lib/siat/risk_adjusted_return2.py +0 -1900
- build/lib/build/lib/siat/risk_evaluation.py +0 -2218
- build/lib/build/lib/siat/risk_free_rate.py +0 -351
- build/lib/build/lib/siat/sector_china.py +0 -4140
- build/lib/build/lib/siat/security_price2.py +0 -727
- build/lib/build/lib/siat/security_prices.py +0 -3408
- build/lib/build/lib/siat/security_trend.py +0 -402
- build/lib/build/lib/siat/security_trend2.py +0 -646
- build/lib/build/lib/siat/stock.py +0 -4284
- build/lib/build/lib/siat/stock_advice_linear.py +0 -934
- build/lib/build/lib/siat/stock_base.py +0 -26
- build/lib/build/lib/siat/stock_china.py +0 -2095
- build/lib/build/lib/siat/stock_prices_kneighbors.py +0 -910
- build/lib/build/lib/siat/stock_prices_linear.py +0 -386
- build/lib/build/lib/siat/stock_profile.py +0 -707
- build/lib/build/lib/siat/stock_technical.py +0 -3305
- build/lib/build/lib/siat/stooq.py +0 -74
- build/lib/build/lib/siat/transaction.py +0 -347
- build/lib/build/lib/siat/translate.py +0 -5183
- build/lib/build/lib/siat/valuation.py +0 -1378
- build/lib/build/lib/siat/valuation_china.py +0 -2076
- build/lib/build/lib/siat/var_model_validation.py +0 -444
- build/lib/build/lib/siat/yf_name.py +0 -811
- build/lib/siat/__init__.py +0 -75
- build/lib/siat/allin.py +0 -137
- build/lib/siat/assets_liquidity.py +0 -915
- build/lib/siat/beta_adjustment.py +0 -1058
- build/lib/siat/beta_adjustment_china.py +0 -548
- build/lib/siat/blockchain.py +0 -143
- build/lib/siat/bond.py +0 -2900
- build/lib/siat/bond_base.py +0 -992
- build/lib/siat/bond_china.py +0 -100
- build/lib/siat/bond_zh_sina.py +0 -143
- build/lib/siat/capm_beta.py +0 -783
- build/lib/siat/capm_beta2.py +0 -887
- build/lib/siat/common.py +0 -5360
- build/lib/siat/compare_cross.py +0 -642
- build/lib/siat/copyrights.py +0 -18
- build/lib/siat/cryptocurrency.py +0 -667
- build/lib/siat/economy.py +0 -1471
- build/lib/siat/economy2.py +0 -1853
- build/lib/siat/esg.py +0 -536
- build/lib/siat/event_study.py +0 -815
- build/lib/siat/fama_french.py +0 -1521
- build/lib/siat/fin_stmt2_yahoo.py +0 -982
- build/lib/siat/financial_base.py +0 -1160
- build/lib/siat/financial_statements.py +0 -598
- build/lib/siat/financials.py +0 -2339
- build/lib/siat/financials2.py +0 -1278
- build/lib/siat/financials_china.py +0 -4433
- build/lib/siat/financials_china2.py +0 -2212
- build/lib/siat/fund.py +0 -629
- build/lib/siat/fund_china.py +0 -3307
- build/lib/siat/future_china.py +0 -551
- build/lib/siat/google_authenticator.py +0 -47
- build/lib/siat/grafix.py +0 -3636
- build/lib/siat/holding_risk.py +0 -867
- build/lib/siat/luchy_draw.py +0 -638
- build/lib/siat/market_china.py +0 -1168
- build/lib/siat/markowitz.py +0 -2363
- build/lib/siat/markowitz2.py +0 -3150
- build/lib/siat/markowitz2_20250704.py +0 -2969
- build/lib/siat/markowitz2_20250705.py +0 -3158
- build/lib/siat/markowitz_simple.py +0 -373
- build/lib/siat/ml_cases.py +0 -2291
- build/lib/siat/ml_cases_example.py +0 -60
- build/lib/siat/option_china.py +0 -3069
- build/lib/siat/option_pricing.py +0 -1925
- build/lib/siat/other_indexes.py +0 -409
- build/lib/siat/risk_adjusted_return.py +0 -1576
- build/lib/siat/risk_adjusted_return2.py +0 -1900
- build/lib/siat/risk_evaluation.py +0 -2218
- build/lib/siat/risk_free_rate.py +0 -351
- build/lib/siat/sector_china.py +0 -4140
- build/lib/siat/security_price2.py +0 -727
- build/lib/siat/security_prices.py +0 -3408
- build/lib/siat/security_trend.py +0 -402
- build/lib/siat/security_trend2.py +0 -646
- build/lib/siat/stock.py +0 -4284
- build/lib/siat/stock_advice_linear.py +0 -934
- build/lib/siat/stock_base.py +0 -26
- build/lib/siat/stock_china.py +0 -2095
- build/lib/siat/stock_prices_kneighbors.py +0 -910
- build/lib/siat/stock_prices_linear.py +0 -386
- build/lib/siat/stock_profile.py +0 -707
- build/lib/siat/stock_technical.py +0 -3305
- build/lib/siat/stooq.py +0 -74
- build/lib/siat/transaction.py +0 -347
- build/lib/siat/translate.py +0 -5183
- build/lib/siat/valuation.py +0 -1378
- build/lib/siat/valuation_china.py +0 -2076
- build/lib/siat/var_model_validation.py +0 -444
- build/lib/siat/yf_name.py +0 -811
- siat-3.10.132.dist-info/RECORD +0 -218
@@ -1,2291 +0,0 @@
# -*- coding: utf-8 -*-

"""
本模块功能:机器学习课程演示案例和基础函数,仅限课堂案例演示用
创建日期:2022年3月25日
最新修订日期:
作者:王德宏 (WANG Dehong, Peter)
作者单位:北京外国语大学国际商学院
用途限制:仅限研究与教学使用,不可商用!商用需要额外授权。
特别声明:作者不对使用本工具进行证券投资导致的任何损益负责!
"""

#==============================================================================
#关闭所有警告
import warnings; warnings.filterwarnings('ignore')
import warnings; warnings.filterwarnings('ignore')
from siat.common import *
from siat.translate import *
from siat.grafix import *
from siat.security_prices import *

#==============================================================================
def ml_demo_iris_matrix():
    """
    功能:机器学习开门课程的演示案例,显示影响鸢尾花品种识别的矩阵图
    输入:无
    显示:鸢尾花花萼/花瓣长宽特征对于品种分类的影响
    输出:无
    """

    #设置绘图时的汉字显示
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['FangSong']
    plt.rcParams['axes.unicode_minus'] = False

    #装入鸢尾花数据集
    from sklearn.datasets import load_iris
    iris_dataset = load_iris()

    #分割样本为两部分:训练集和测试集
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(iris_dataset.data, iris_dataset.target, random_state=0)

    #为了绘图需要,将样本转换成数据表格式
    import pandas as pd
    iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)
    iris_dataframe.rename(columns={'sepal length (cm)':'花萼长(厘米)','sepal width (cm)':'花萼宽(厘米)','petal length (cm)':'花瓣长(厘米)','petal width (cm)':'花瓣宽(厘米)'}, inplace = True)

    #绘制训练集的散点图,观察特征值对于标签的区分度
    import mglearn
    import pandas.plotting as pdp
    #figsize为画布大小,marker为散点形状,s为散点大小,alpha为透明度,bins为直方图的条数,cmap为调色板(colormap),c为着色对象(根据着色对象的不同值着不同颜色)
    grr = pdp.scatter_matrix(iris_dataframe, c=y_train, figsize=(15, 15), marker='o',hist_kwds={'bins': 20}, s=60, alpha=.8, cmap=mglearn.cm3)
    #换个调色板试试:红绿粉色
    grr = pdp.scatter_matrix(iris_dataframe, c=y_train, figsize=(15, 15), marker='o',hist_kwds={'bins': 20}, s=60, alpha=.8, cmap=plt.cm.gist_rainbow)
    #再换个调色板试试:蓝绿锗色
    grr = pdp.scatter_matrix(iris_dataframe, c=y_train, figsize=(15, 15), marker='o',hist_kwds={'bins': 20}, s=60, alpha=.8, cmap='jet')

    return


if __name__=='__main__':
    ml_demo_iris_matrix()

if __name__=='__main__':
    n_neighbors=5
    weights='uniform'
    algorithm='auto'
    leaf_size=30
    p=2
    metric='minkowski'
    metric_params=None
    n_jobs=None

def ml_demo_iris_learn(n_neighbors=5,weights='uniform',algorithm='auto', \
                       leaf_size=30,p=2,metric='minkowski',metric_params=None,n_jobs=None):
    """
    功能:机器学习开门课程的演示案例,学习鸢尾花品种识别
    输入:无
    显示:学习效果,有无过拟合
    输出:学习后的模型,用于预测未知的鸢尾花品种
    注意:需要安装mglearn轮子
    """
    print("\n 开始进行鸢尾花品种识别 ... ...")

    #设置绘图时的汉字显示
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['FangSong']
    plt.rcParams['axes.unicode_minus'] = False

    #装入鸢尾花数据集
    print(" 装入供学习的鸢尾花品种样本 ...")
    from sklearn.datasets import load_iris
    iris_dataset = load_iris()
    print(" 鸢尾花品种样本总数:",len(iris_dataset.data))
    print(" 鸢尾花品种名称:",iris_dataset.target_names)
    print(" 鸢尾花特征名称:",iris_dataset.feature_names)

    #分割样本为两部分:训练集和测试集
    print("\n 将鸢尾花品种样本按3:1随机拆分训练集和测试集 ...")
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(iris_dataset.data, iris_dataset.target, random_state=0)
    print(" 训练集/测试集样本总数:",len(y_train.data),'/',len(y_test.data))


    # 引入最近邻分类模型:
    from sklearn.neighbors import KNeighborsClassifier
    #将模型实例化(初始化)
    print("\n 开始学习训练集:使用最近邻模型 ...")
    knn = KNeighborsClassifier(n_neighbors=n_neighbors,weights=weights, \
        algorithm=algorithm,leaf_size=leaf_size,p=p,metric=metric, \
        metric_params=metric_params,n_jobs=n_jobs)
    print(" 模型的邻居个数、权重、距离测度:",n_neighbors,'\b,',weights,'\b,',metric)

    #让模型“学习”训练集增长“见识”,以便能够识别未知的鸢尾花品种(回归+拟合)
    knn.fit(X_train, y_train)

    #学习结果成绩:97.32%
    train_score=knn.score(X_train, y_train)
    print(" 训练集学习完毕,识别率:",round(train_score,4))

    #使用测试集评估模型的“学习成绩”:
    test_score=knn.score(X_test, y_test)
    print(" 使用未知的测试集进行检验,识别率:",round(test_score,4))

    #返回模型
    return knn

if __name__=='__main__':
    model=knn
    new_iris=[5,2.9,1,0.2]

def ml_demo_iris_check(model,new_iris):
    """
    功能:基于已学习的模型model,预测一株鸢尾花new_iris属于什么品种
    输入:模型,新鸢尾花的数据[花萼长,花萼宽,花瓣长,花瓣宽]
    显示:预测的品种名称
    """
    print("\n 开始进行鸢尾花品种识别 ... ...")
    print(" 新鸢尾花的特征:花萼花瓣长宽为",new_iris)

    #构造一株新鸢尾花的特征数据
    import numpy as np
    X_new = np.array([new_iris])

    #利用机器学习的经验进行品种识别
    prediction = model.predict(X_new)
    probability = model.predict_proba(X_new)

    #显示识别的结果:鸢尾花的品种名称
    iris_names=['setosa','versicolor','virginica']
    id=prediction[0]
    iris_name=iris_names[id]
    print(" 基于机器学习的结果,判断该鸢尾花的品种:",iris_name)
    print(" 判断该鸢尾花品种的把握度:",probability[0,0]*100,'\b%')

    return

if __name__=='__main__':
    knn=ml_demo_iris_learn()
    ml_demo_iris_check(knn,new_iris)

#==============================================================================
#定义函数:欧几里得距离
from math import *
def eculidean_distance(xi, xj):
    distance = sqrt(sum(pow(a - b, 2) for a, b in zip(xi, xj)))
    return distance

if __name__ =="__main__":
    #示例:结果为3.873
    xi = [1, 3, 2, 4]
    xj = [2, 5, 3, 1]
    print(eculidean_distance(xi, xj))

#定义函数:曼哈顿距离
def manhattan_distance(xi, xj):
    distance = sum(abs(a - b) for a, b in zip(xi, xj))
    return distance

if __name__ =="__main__":
    #示例:结果为7
    xi = [1, 3, 2, 4]
    xj = [2, 5, 3, 1]
    print(manhattan_distance(xi, xj))

#定义函数:闵可夫斯基距离
def minkowski_distance(xi, xj, p):
    sumval = sum(pow(abs(a - b), p) for a, b in zip(xi, xj))
    mi = 1/ float(p)
    distance = sumval ** mi
    return distance

if __name__ =="__main__":
    #示例:结果为3.332
    xi = [1, 3, 2, 4]
    xj = [2, 5, 3, 1]
    print(minkowski_distance(xi, xj, 3))

#定义函数:切比雪夫距离,相当于空间曼哈顿距离
def chebyshev_distance(xi, xj):
    distance = max(abs(a - b) for a, b in zip(xi, xj))
    return distance

if __name__ =="__main__":
    #示例:结果为3
    xi = [1, 3, 2, 4]
    xj = [2, 5, 3, 1]
    print(chebyshev_distance(xi, xj))

#定义函数:余弦相似度
import numpy as np
def cosine_similarity(xi, xj):
    numerator = sum(map(float, xi * xj))
    #求向量(矩阵)的范数:np.linalg.norm
    denominator = np.linalg.norm(xi) * np.linalg.norm(xj)
    similarity = numerator / float(denominator)
    return similarity

if __name__ =="__main__":
    # 示例:结果为1,是相同的两个向量
    xi = np.array([3, 4, 1, 5])
    xj = np.array([3, 4, 1, 5])
    print(cosine_similarity(xi, xj))


#距离的通用算法:pdist
def universal_distance(xi,xj,option='minkowski',p=3):
    """
    功能:统一的距离算法
    """
    #支持的距离选项
    option_list = [
        'braycurtis',
        'canberra',
        'chebyshev', #切比雪夫距离
        'cityblock', #曼哈顿距离
        'correlation',
        'cosine', #余弦相似度
        'dice',
        'euclidean', #欧几里得距离
        'hamming',
        'jaccard',
        'jensenshannon',
        'kulsinski',
        'mahalanobis', #著名的马氏距离
        'matching',
        'minkowski', #闵可夫斯基距离, 有p参数
        'rogerstanimoto',
        'russellrao',
        'seuclidean', #标准化欧几里得距离
        'sokalmichener',
        'sokalsneath',
        'sqeuclidean',
        'yule']
    if not (option in option_list):
        print(" 不支持的距离测度方法:",option)
        print(" 支持的距离测度方法:",option_list)
        return

    from scipy.spatial.distance import pdist
    X=np.vstack([xi, xj])
    distance=pdist(X, option, parm)

    return distance

#==============================================================================
#==============================================================================
#==============================================================================

def get_stock_price(ticker,atdate,fromdate):
    """
    功能:抓取股票收盘价
    输出:指定股票的收盘价格序列,最新日期的股价排列在前
    ticker:股票代码
    atdate:当前日期,既可以是今天日期,也可以是一个历史日期,datetime类型
    fromdate:样本开始日期,尽量远的日期,以便取得足够多的原始样本,类型同atdate
    """
    #抓取股票价格
    try:
        price=get_price(ticker,fromdate,atdate)
    except:
        print(".Error(get_stock_price), failed to capture stock prices:",ticker,fromdate,atdate)
        return None
    #去掉比起始日期更早的样本
    #price=price[price.index >= fromdate]

    #按日期降序排序,近期的价格排在前面
    sortedprice=price.sort_index(axis=0,ascending=False)

    #提取日期和星期几
    sortedprice['Date']=sortedprice.index.strftime("%Y-%m-%d")
    sortedprice['Weekday']=sortedprice.index.weekday+1

    #生成输出数据格式:日期,星期几,收盘价
    dfprice=sortedprice[['Date','Weekday','Close']]

    return dfprice


if __name__=='__main__':
    ticker='MSFT'
    atdate='4/2/2020'
    fromdate='1/1/2015'
    dfprice=get_stock_price('MSFT','4/2/2020','1/1/2015')
    dfprice.head(5)
    dfprice.tail(3)
    dfprice[dfprice.Date == '2019-03-29']
    dfprice[(dfprice.Date>='2019-03-20') & (dfprice.Date<='2019-03-29')]


#==============================================================================
def make_price_sample(dfprice,n_nextdays=1,n_samples=252,n_features=21):
    """
    功能:生成指定股票的价格样本
    ticker:股票代码
    n_nextdays:预测从atdate开始未来第几天的股价,默认为1
    n_samples:需要生成的样本个数,默认252个(一年的平均交易天数)
    n_features:使用的特征数量,默认21个(一个月的平均交易天数)
    """
    #检查样本数量是否够用
    n_req=n_features+n_nextdays+n_samples
    if len(dfprice) < n_req:
        print(".Error(make_price_sample), need more number of stock prices!")
        print("...There are only",len(dfprice),"obs in the stock price file")
        print("...But, I need at least",n_req,"obs to make ML samples")
        return None,None,None

    #提取收盘价,Series类型
    closeprice=dfprice.Close

    #转换为机器学习需要的ndarray类型
    import numpy as np
    ndprice=np.asmatrix(closeprice,dtype=None)

    #生成第一个标签样本:标签矩阵y(形状:n_samples x 1)
    y=np.asmatrix(ndprice[0,0])
    #生成第一个特征样本:特征矩阵X(形状:n_samples x n_features)
    X=ndprice[0,n_nextdays:n_features+n_nextdays]

    #生成其余的标签样本和特征样本
    for i in range(1,n_samples):
        #加入到标签矩阵中
        y_row=np.asmatrix(ndprice[0,i])
        y=np.append(y,y_row,axis=0)
        #加入到特征矩阵中
        X_row=ndprice[0,(n_nextdays+i):(n_features+n_nextdays+i)]
        X=np.append(X,X_row,axis=0)

    return X,y,ndprice

if __name__=='__main__':
    dfprice=get_stock_price('LK','4/3/2020','1/1/2015')
    X,y,ndprice=make_price_sample(dfprice,1,200,21)
    y[:5]
    y[2:5] #第1行的序号为0
    X[:5]
    X[:-5]
    X[3-1,2-1]

#==============================================================================
def bestKN(X,y,maxk=10,random_state=0):
    """
    功能:给定特征矩阵和标签,返回最优的邻居个数(默认最大为10)和模型
    最优策略:测试集分数最高,不管过拟合问题
    """
    #随机分割样本为训练集和测试集
    from sklearn.model_selection import train_test_split
    X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=random_state)

    #引用k近邻模型的预测器(Regressor)
    from sklearn.neighbors import KNeighborsRegressor
    bestk=1
    reg=KNeighborsRegressor(n_neighbors=bestk,weights='distance',n_jobs=-1)
    reg.fit(X_train,y_train)
    bestmodel=reg
    bestscore_train=bestmodel.score(X_train,y_train)
    bestscore_test =bestmodel.score(X_test,y_test)

    for k in range(2,(maxk+1)):
        reg=KNeighborsRegressor(n_neighbors=k,weights='distance',n_jobs=-1)
        reg.fit(X_train,y_train)
        score_train=reg.score(X_train,y_train)
        score_test =reg.score(X_test,y_test)

        if score_test > bestscore_test:
            bestk=k
            bestscore_train=score_train
            bestscore_test =score_test
            bestmodel=reg

    return bestmodel,bestk,bestscore_train,bestscore_test

if __name__=='__main__':
    dfprice=get_stock_price('MSFT','4/3/2019','1/1/2015')
    X,y,ndprice=make_price_sample(dfprice,1,240,20)
    bestmodel,bestk,bestscore_train,bestscore_test=bestKN(X,y)
    print(bestk,bestscore_train,bestscore_test)


#==============================================================================
def bestFN(dfprice,n_nextdays=1,n_samples=252,maxFN=252,random_state=0):
    """
    功能:给定股价序列,试验最优的特征个数(默认最大为60)和模型
    最优策略:测试集分数最高,不管过拟合问题
    """
    #试验起点:1个特征个数
    bestf=1
    X,y,ndprice=make_price_sample(dfprice,n_nextdays,n_samples,bestf)
    #测试给定特征个数时的最优邻居个数
    bestmodel,bestk,bestscore_train,bestscore_test=bestKN(X,y,random_state=random_state)

    #特征个数增长的步长
    n_step=1
    for f in range(2,maxFN+1,n_step):
        if len(dfprice) < (n_nextdays+n_samples+f): break
        X,y,ndprice=make_price_sample(dfprice,n_nextdays,n_samples,f)
        model,k,score_train,score_test=bestKN(X,y,random_state=random_state)

        if score_test > bestscore_test:
            bestf=f
            bestk=k
            bestscore_train=score_train
            bestscore_test =score_test
            bestmodel=model

    #返回测试集效果最好的模型、特征个数、邻居个数、成绩
    return bestmodel,bestf,bestk,bestscore_train,bestscore_test

if __name__=='__main__':
    dfprice=get_stock_price('MSFT','4/4/2020','1/1/2015')
    bestmodel,bestf,bestk,bestscore_train,bestscore_test= \
        bestFN(dfprice,1,252)

    print("best f=",bestf,",best k=",bestk, \
        "\nbest score on train=",bestscore_train, \
        "\nbest score on test=",bestscore_test)


#==============================================================================
def bestKN2(X,y,maxk=10,random_state=0):
    """
    功能:给定特征矩阵和标签,返回最优的邻居个数(默认最大为10)和模型
    最优策略2:训练集和测试集分数最接近,希望控制过拟合和欠拟合问题
    """
    #随机分割样本为训练集和测试集
    from sklearn.model_selection import train_test_split
    X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=random_state)

    #引用k近邻模型的预测器(Regressor)
    from sklearn.neighbors import KNeighborsRegressor
    bestk=1
    reg=KNeighborsRegressor(n_neighbors=bestk,weights='distance',n_jobs=-1)
    reg.fit(X_train,y_train)
    bestmodel=reg
    bestscore_train=reg.score(X_train,y_train)
    bestscore_test =reg.score(X_test,y_test)

    import numpy as np
    bestrate=np.abs(bestscore_train / bestscore_test -1)

    for k in range(2,(maxk+1)):
        reg=KNeighborsRegressor(n_neighbors=k,weights='distance',n_jobs=-1)
        reg.fit(X_train,y_train)
        score_train=reg.score(X_train,y_train)
        score_test =reg.score(X_test,y_test)
        rate=np.abs(score_train / score_test -1)

        if rate < bestrate:
            bestk=k
            bestrate=rate
            bestscore_train=score_train
            bestscore_test =score_test
            bestmodel=reg
    return bestmodel,bestk,bestscore_train,bestscore_test,bestrate

if __name__=='__main__':
    dfprice=get_stock_price('MSFT','3/27/2019','1/1/2015')
    X,y,ndprice=make_price_sample(dfprice,1,252,21)

    bestmodel,bestk,bestscore_train,bestscore_test=bestKN(X,y)
    print("best k=",bestk,"\nbest score on train=",bestscore_train, \
        ",best score on test=",bestscore_test)

    bestmodel,bestk,bestscore_train,bestscore_test,bestrate=bestKN2(X,y)
    print("best k=",bestk,"\nbest score on train=",bestscore_train, \
        ",best score on test=",bestscore_test)



#==============================================================================
def bestFN2(dfprice,n_nextdays=1,n_samples=252,maxFN=252,random_state=0):
    """
    功能:给定股价序列,试验最优的特征个数(默认最大为252)和模型
    最优策略2:训练集和测试集分数最接近,希望控制过拟合和欠拟合问题
    """
    #试验起点:1个特征个数
    bestf=1
    X,y,ndprice=make_price_sample(dfprice,n_nextdays,n_samples,bestf)
    #测试给定特征个数时的最优邻居个数
    bestmodel,bestk,bestscore_train,bestscore_test,bestrate=bestKN2(X,y,random_state=random_state)

    #特征个数增长的步长
    n_step=1
    for f in range(2,maxFN+1,n_step):
        if len(dfprice) < (n_nextdays+n_samples+f): break
        X,y,ndprice=make_price_sample(dfprice,n_nextdays,n_samples,f)
        model,k,score_train,score_test,rate=bestKN2(X,y,random_state=random_state)

        if rate < bestrate:
            bestf=f
            bestk=k
            bestscore_train=score_train
            bestscore_test =score_test
            bestrate=rate
            bestmodel=model

    #返回测试集效果最好的模型、特征个数、邻居个数、成绩
    return bestmodel,bestf,bestk,bestscore_train,bestscore_test,bestrate

if __name__=='__main__':
    dfprice=get_stock_price('MSFT','3/27/2019','1/1/2015')
    bestmodel,bestf,bestk,bestscore_train,bestscore_test= \
        bestFN(dfprice,1,252)
    print("best f=",bestf,",best k=",bestk, \
        "\nbest score on train=",bestscore_train, \
        "\nbest score on test=",bestscore_test)

    bestmodel,bestf,bestk,bestscore_train,bestscore_test= \
        bestFN2(ndprice,1,252)
    print("best f=",bestf,",best k=",bestk, \
        "\nbest score on train=",bestscore_train, \
        "\nbest score on test=",bestscore_test)

#==============================================================================
def isdate(adate):
    """
    功能:根据日期的合理性
    输入参数:
    adate:日期。格式:YYYY-MM-DD
    输出:无
    返回:有效/无效日期(True/False)
    """
    import pandas as pd
    #测试开始日期的合理性
    try: adatedt=pd.to_datetime(adate)
    except: return False
    else: return True

#==============================================================================
def date_adjust(basedate, adjust=0):
    """
    功能:将给定日期向前或向后调整特定的天数
    输入:基础日期,需要调整的天数。
    basedate: 基础日期。
    adjust:需要调整的天数,负数表示向前调整,正数表示向后调整。
    输出:调整后的日期。
    """
    #检查基础日期的合理性
    import pandas as pd
    try:
        bd=pd.to_datetime(basedate)
    except:
        print("*** 错误#1(date_adjust),无效的日期:",basedate)
        return None

    #调整日期
    from datetime import timedelta
    nd = bd+timedelta(days=adjust)

    #重新提取日期
    newdate=nd.date()
    return str(newdate)

if __name__ =="__main__":
    basedate='2020-3-17'
    adjust=-365
    newdate = date_adjust(basedate, adjust)
    print(newdate)

#==============================================================================
def forecast_stock_price(ticker,atdate,n_nextdays,n_samples=252, \
        maxk=20,maxFN=252,random_state=0,printout=True):
    """
    功能:预测未来第几天的股票收盘价,执行FN和FN2优化策略
    """
    #检查日期的合理性
    if not isdate(atdate):
        print(".Error(forecast_stock_price), invalid date:",atdate)
        return None

    print("..Predicting stock price, it may take long time, please wait ......")

    #设定起始日期:
    nyears=int((n_nextdays + n_samples + maxFN + 1)/252)+2
    start=date_adjust(atdate,-366*nyears)

    #抓取股价数据
    dfprice=get_stock_price(ticker,atdate,start)
    if dfprice is None:
        print(".Error(forecast_stock_price), failed to capture stock prices:",ticker)
        return None
    if len(dfprice) < (n_nextdays + n_samples + maxFN + 1):
        print(".Error(forecast_stock_price), insufficient number of stock prices!")
        return None

    #生成机器学习样本1: 确定最佳特征个数bestf,不管过拟合/欠拟合问题
    bestmodel1,bestf1,bestk1,bestscore_train1,bestscore_test1= \
        bestFN(dfprice,n_nextdays,n_samples,random_state=random_state)
    X,y,ndprice=make_price_sample(dfprice,n_nextdays,n_samples,bestf1)

    #基于最新特征样本X_new,预测第n_nextdays的股价
    X_new1=ndprice[0,0:bestf1]
    y_new1=bestmodel1.predict(X_new1)


    #生成机器学习样本2: 确定最佳特征个数bestf,考虑过拟合/欠拟合问题
    bestmodel2,bestf2,bestk2,bestscore_train2,bestscore_test2,bestrate2= \
        bestFN2(dfprice,n_nextdays,n_samples)
    X,y,ndprice=make_price_sample(dfprice,n_nextdays,n_samples,bestf2)
    X_new2=ndprice[0,0:bestf2]
    y_new2=bestmodel2.predict(X_new2)


    #最终决定:以最大测试成绩为优先
    if bestscore_test1 <= bestscore_test2:
        predicted_y=y_new2[0,0]
        bestscore_train=bestscore_train2
        bestscore_test=bestscore_test2
        bestfeature=bestf2
        bestk=bestk2
    else:
        predicted_y=y_new1[0,0]
        bestscore_train=bestscore_train1
        bestscore_test=bestscore_test1
        bestfeature=bestf1
        bestk=bestk1
    if printout:
        print(" Forecasted price:%10.2f" % predicted_y)
        print(" Best score on train:",round(bestscore_train,4))
        print(" Best score on test:",round(bestscore_test,4))
        print(" Best number of features:",bestfeature)
        print(" Best number of neighbors:",bestk)

    return predicted_y,bestscore_train,bestscore_test,bestfeature,bestk


if __name__ =="__main__":
    ticker='MSFT'
    atdate="2020-4-2"
    n_nextdays=1
    info=forecast_stock_price(ticker,atdate,n_nextdays)
    print(info)

#==============================================================================
def forecast_stock_price2(dfprice,n_nextdays,n_samples=252, \
        maxk=20,maxFN=252,random_state=0):
    """
    功能:预测未来第几天的股票收盘价,执行FN和FN2优化策略,单一随机数种子
    """
    #生成机器学习样本1: 确定最佳特征个数bestf,不管过拟合/欠拟合问题
    bestmodel1,bestf1,bestk1,bestscore_train1,bestscore_test1= \
        bestFN(dfprice,n_nextdays,n_samples,random_state=random_state)
    X,y,ndprice=make_price_sample(dfprice,n_nextdays,n_samples,bestf1)

    #基于最新特征样本X_new,预测第n_nextdays的股价
    X_new1=ndprice[0,0:bestf1]
    y_new1=bestmodel1.predict(X_new1)


    #生成机器学习样本2: 确定最佳特征个数bestf,考虑过拟合/欠拟合问题
    bestmodel2,bestf2,bestk2,bestscore_train2,bestscore_test2,bestrate2= \
        bestFN2(dfprice,n_nextdays,n_samples)
    X,y,ndprice=make_price_sample(dfprice,n_nextdays,n_samples,bestf2)
    X_new2=ndprice[0,0:bestf2]
    y_new2=bestmodel2.predict(X_new2)

    #最终决定:以最大测试成绩为优先
    if bestscore_test1 <= bestscore_test2:
        predicted_y=y_new2[0,0]
        bestscore_train=bestscore_train2
        bestscore_test=bestscore_test2
        bestfeature=bestf2
        bestk=bestk2
    else:
        predicted_y=y_new1[0,0]
        bestscore_train=bestscore_train1
        bestscore_test=bestscore_test1
        bestfeature=bestf1
        bestk=bestk1

    return round(predicted_y,2),round(bestscore_train,4), \
        round(bestscore_test,4),bestfeature,bestk


if __name__ =="__main__":
    ticker='MSFT'
    atdate="2020-4-2"
    n_nextdays=1
    dfprice=get_stock_price('MSFT','4/2/2020','1/1/2015')
    info=forecast_stock_price2(dfprice,n_nextdays)
    print(info)

#==============================================================================
def weighted_median(df,colname,colweight):
    """
    功能:求加权中位数
    输入:数据表df, 需要求中位数的列名colname, 权重所在的列名colweight
    返回:50%中位数数值
    """
    from statsmodels.stats.weightstats import DescrStatsW
    wdf = DescrStatsW(df[colname], weights=df[colweight], ddof=1)

    if len(df) >= 3:
        wmedianlist=list(wdf.quantile([0.50]))
        wmedian=wmedianlist[0]
    elif len(df) == 2:
        wmedian=(df[colname][0]*df[colweight][0]+df[colname][1]*df[colweight][1])/(df[colweight][0]+df[colweight][1])
    elif len(df) == 1:
        wmedian=df[colname][0]
    else:
        return None

    return wmedian

if __name__ =="__main__":
    import pandas as pd
    df=pd.DataFrame({ 'x':range(1,3), 'wt':range(1,3) })
    colname='x'
    colweight='wt'
    weighted_median(df,colname,colweight)

#==============================================================================
def second2time(seconds):
    """
    功能:将秒数转换为时分秒
    输入:秒数
    返回:时分秒,字符串
    """
    hours=int(seconds/3600)
    minutes=int((seconds-hours*3600)/60)

    if seconds >= 60:
        decm=1
    elif seconds >= 10:
        decm=1
    elif seconds >= 0.1:
        decm=2
    else:
        decm=4
    miaos=round(seconds-hours*3600-minutes*60,decm)
    timestr=str(hours)+":"+str(minutes)+":"+str(miaos)

    return timestr

if __name__ =="__main__":
    second2time(590.58963)
    second2time(65.456321)
    second2time(35.75698)
    second2time(5.75698)
    second2time(0.75698)
    second2time(0.00098)
#==============================================================================
def save_to_excel(df,excelfile="myfile01.xlsx",sheetname="Sheet1"):
    """
    函数功能:将df保存到当前目录下的Excel文件。
    如果未指定Excel文件则默认为"myfile.xls"
    如果Excel文件不存在则创建文件并保存到指定的sheetname;如果未指定sheetname则默
    认为"First"
    如果Excel文件存在但sheetname不存在则增加sheetname并保存df内容,原有sheet内容
    不变;
    如果Excel文件和sheetname都存在则追加df内容到已有sheet的末尾
    输入参数:
    df: 数据框
    excelfile: Excel文件名,不带目录,后缀为.xls或.xlsx
    sheetname:Excel文件中的sheet名
    输出:
    保存df到Excel文件
    无返回数据
    注意:如果df中含有以文本表示的数字,写入到Excel会被自动转换为数字类型保存。
    从Excel中读出后为数字类型,因此将会与df的类型不一致
    """
    #取得df字段列表
    dflist=list(df)
    #合成完整的带目录的文件名
    filename=excelfile

    import pandas as pd
    try:
        file1=pd.ExcelFile(excelfile)
    except:
        #不存在excelfile文件,直接写入
        df.to_excel(filename,sheet_name=sheetname, \
            header=True,encoding='utf-8')
        print("*** Results saved in",filename,"@ sheet",sheetname)
        return
    else:
        #已存在excelfile文件,先将所有sheet的内容读出到dict中
        dict=pd.read_excel(file1, None)
        file1.close()

        #获得所有sheet名字
        sheetlist=list(dict.keys())
        #检查新的sheet名字是否已存在
        try:
            pos=sheetlist.index(sheetname)
        except:
            #不存在重复
            dup=False
        else:
            #存在重复,合并内容
            dup=True
            #合并之前可能需要对df中以字符串表示的数字字段进行强制类型转换.astype('int')
            df1=dict[sheetlist[pos]][dflist]
            dfnew=pd.concat([df1,df],axis=0,ignore_index=True)
            dict[sheetlist[pos]]=dfnew

        #将原有内容写回excelfile
        result=pd.ExcelWriter(filename)
        for s in sheetlist:
            df1=dict[s][dflist]
            df1.to_excel(result,s,header=True,index=True,encoding='utf-8')
        #写入新内容
        if not dup: #sheetname未重复
            df.to_excel(result,sheetname,header=True,index=True,encoding='utf-8')
        try:
            result.save()
            result.close()
        except:
            print("... Error(save_to_excel): writing file failed",filename,"@ sheet",sheetname)
            print("Information:",filename)
            return
    print("*** Results saved in",filename,"@ sheet",sheetname)

    return


#==============================================================================
def forecast_stock_price_rs(ticker,atdate,n_nextdays=1,n_samples=252, \
        maxk=20,maxFN=252,random_state=0,maxRS=9, \
        excelfile="myfile01.xlsx",sheetname="Sheet1"):
    """
    功能:预测未来第几天的股票收盘价,试验随机数种子策略
    输入参数:
    1、ticker: 股票代码
    2、atdate: 当前日期,可以是今天或以前的一天
    3、n_nextdays: 以atdate为基准向前推进几个交易日,预测该日期的股价
    4、n_samples: 生成机器学习用的样本中的最大观察数目。
    跨年的样本有助于模型学习季节性效应,3年的样本效果好于2年,
    2年的样本效果好于1年
    5、maxk:试探的最大邻居个数
    6、maxFN:试探的最大特征个数
    7、random_state: 开始试探时的随机数种子
    8、maxRS: 用于试探的最大的随机数种子
    9、excelfile:保存文件的名字
    10、sheetname:Excel文件的sheet名字
    输出:每次迭代取得更好的测试集分数时,输出模型参数和预测的股价
    返回:最优测试集的模型参数及预测的股价,以及各个迭代最优结果下预测的股价的
    加权中位数,权重为各个测试集分数。
    """
    #检查日期的合理性
    if not isdate(atdate):
        print(".Error(forecast_stock_price_rs), invalid date:",atdate)
        return None

    #开始计时
    print("\n... Predicting stock price, it may take very long time, please wait ......")
    import time
    time0 = time.perf_counter()

    #设定起始日期:
    nyears=int((n_nextdays + n_samples + maxFN + 1)/252)+2
    start=date_adjust(atdate,-366*nyears)

    #抓取股价数据
    dfprice=get_stock_price(ticker,atdate,start)
    if dfprice is None:
        print(".Error(forecast_stock_price_rs), failed to capture stock prices:",ticker)
        return None
    if len(dfprice) < (n_nextdays + n_samples + maxFN + 1):
        print(".Error(forecast_stock_price_rs), insufficient number of stock prices!")
        return None

    #设置测试集分数起点
    bestscore_test=0.0
    #建立结果表结构
    import pandas as pd
    result=pd.DataFrame(columns=('ticker','atdate','n_nextdays','n_samples', \
        'random_state','pred_y','bestscore_train', \
        'bestscore_test','bestfeature','bestk'))
    #倒序随机数种子,便于尽快看到最优结果
    rslist=list(range(random_state,maxRS+1))
    rslist.reverse()
    #开始逐一试探各个随机数种子的最佳分数
    for rs in rslist:
        print("... Testing random seed:",rs)
        pred_y0,bestscore_train0,bestscore_test0,bestfeature0,bestk0= \
            forecast_stock_price2(dfprice,n_nextdays=n_nextdays, \
                n_samples=n_samples,maxk=maxk, \
                maxFN=maxFN,random_state=rs)

        #记录中间结果
        row=pd.Series({'ticker':ticker,'atdate':atdate,'n_nextdays':n_nextdays, \
            'n_samples':n_samples,'random_state':rs,'pred_y':pred_y0, \
            'bestscore_train':bestscore_train0,'bestscore_test':bestscore_test0, \
            'bestfeature':bestfeature0,'bestk':bestk0})
        result=result.append(row,ignore_index=True)

        #更新最佳纪录
        if bestscore_test < bestscore_test0:
            pred_y=pred_y0
            bestscore_train=bestscore_train0
            bestscore_test=bestscore_test0
            bestfeature=bestfeature0
            bestk=bestk0

            print(" Predicted stock price :",pred_y)
            print(" Best score on train :",bestscore_train)
            print(" Best score on test :",bestscore_test)
            print(" Best number of features :",bestfeature)
            print(" Best number of neighbors:",bestk,"\n")

    #再度显示中间结果
    pd.set_option('display.unicode.ambiguous_as_wide', True)
    pd.set_option('display.unicode.east_asian_width', True)
    pd.set_option('display.width', 180) # 设置打印宽度(**重要**)
    print("... Summary:")
    print(result.to_string(index=False))
    print("\n... Result by highest score on test:",result['pred_y'][-1].values[0])

    #计算运行时间
    time1 = time.perf_counter()
    elapsed=time1 - time0
    print("... Total elapsed time is",second2time(elapsed))

    save_to_excel(result,excelfile,sheetname)
    print("... Results saved in an Excel file:",excelfile,"@sheet",sheetname)

    return result

if __name__ =="__main__":
    ticker='MSFT'
    atdate="2020-4-5"
    n_nextdays=1
    maxRS=1
    info=forecast_stock_price_rs(ticker,atdate,n_nextdays,maxRS=maxRS)
    print(info.to_string(index=False))

#==============================================================================
def multisummary(result,notes='',top=5):
    """
    功能:计算其加权中位数
    输入参数:
    1、result: 各个随机数种子下的最优预测值
    2、top: 采用测试分数最高的几个结果参加加权中位数计算
    输出:加权中位数
    返回:预测的股价的加权中位数,权重为各个测试集分数。
    """

    #检查文件是否为空
    if len(result)==0:
        print("... Error(multisummary), No data recieved!")
        return None

    #排序: 升序
    result.sort_values(by=["bestscore_test","bestfeature"],ascending=[True,True],inplace=True)

    #对预测的股价取加权中位数
    if len(result) < top: top=len(result)
    topdata=result.tail(top)
    pred_y_wmedian=round(weighted_median(topdata,'pred_y','bestscore_test'),2)

    #显示详细结果
    import pandas as pd
    pd.set_option('display.unicode.ambiguous_as_wide', True)
    pd.set_option('display.unicode.east_asian_width', True)
    pd.set_option('display.width', 180) # 设置打印宽度(**重要**)

    print("\n... Summary:",notes)
    print(result.to_string(index=False))
    hsotest=round(result.tail(1)['pred_y'].values[0],2)
    if notes == 'final':
        print("\n... Predicted price by highest score on test:",hsotest)
        print("... Predicted in median weighted by score on test:",pred_y_wmedian)

    return hsotest,pred_y_wmedian

if __name__ =="__main__":
    wmprice=multisummary(result,top=5)

#==============================================================================
def forecast_stock_price3(dfprice,n_nextdays=1,n_samples=252*3, \
        maxk=20,maxFN=252*3,random_state=0):
    """
    功能:预测未来第几天的股票收盘价,试验单个随机数种子策略。可作为独立进程
    输入参数:
    1、dfprice: 抓取的股价数据集
    2、n_nextdays: 以atdate为基准向前推进几个交易日,预测该日期的股价
    3、n_samples: 生成机器学习用的样本中的最大观察数目。
    跨年的样本有助于模型学习季节性效应,3年的样本效果好于2年,
    2年的样本效果好于1年
    4、maxk:试探的最大邻居个数
    5、maxFN:试探的最大特征个数
    6、random_state: 随机数种子
    输出:单次迭代取得更好的测试集分数时,输出模型参数和预测的股价
    返回:最优测试集的模型参数及预测的股价。
    """
    #显示进程号
    import multiprocessing as mp
    pname=mp.current_process().name
    print("... Starting sub-process",pname,"with random_state",random_state)

    #试探一个随机数种子的最佳分数
    pred_y0,bestscore_train0,bestscore_test0,bestfeature0,bestk0= \
        forecast_stock_price2(dfprice,n_nextdays=n_nextdays, \
            n_samples=n_samples,maxk=maxk, \
            maxFN=maxFN,random_state=random_state)
    #记录中间结果
    import pandas as pd
    row=pd.Series({'random_state':random_state,'pred_y':pred_y0, \
        'bestscore_train':bestscore_train0,'bestscore_test':bestscore_test0, \
        'bestfeature':bestfeature0,'bestk':bestk0})

    print("... Endting sub-process",pname)
    return row

if __name__ =="__main__":
    ticker='MSFT'
    atdate="2020-4-5"
    n_nextdays=1
    random_state=0
    info=forecast_stock_price3(dfprice,n_nextdays,random_state=random_state)
    print(info)

#==============================================================================
def forecast_stock_price_mp(ticker,atdate,n_nextdays=1,n_samples=252*3, \
        maxk=20,maxFN=252*3,random_state=0,maxRS=9,top=5):
    """
    功能:预测未来第几天的股票收盘价,试验随机数种子策略,多进程
    输入参数:
    1、ticker: 股票代码
    2、atdate: 当前日期,可以是今天或以前的一天
    3、n_nextdays: 以atdate为基准向前推进几个交易日,预测该日期的股价
    4、n_samples: 生成机器学习用的样本中的最大观察数目。
    跨年的样本有助于模型学习季节性效应,3年的样本效果好于2年,
    2年的样本效果好于1年
    5、maxk:试探的最大邻居个数
    6、maxFN:试探的最大特征个数
    7、random_state: 开始试探时的随机数种子
    8、maxRS: 用于试探的最大的随机数种子
    9、top: 最后中参与计算加权中位数的个数
    输出:每次迭代取得更好的测试集分数时,输出模型参数和预测的股价
    返回:最优测试集的模型参数及预测的股价,以及各个迭代最优结果下预测的股价的
    加权中位数,权重为各个测试集分数。
    """
    #调试开关
    DEBUG=True

    #检查日期的合理性
    if not isdate(atdate):
        print(".Error(forecast_stock_price_rs), invalid date:",atdate)
        return None

    #开始信息
    print("\n... Predicting stock price by knn model ......")
    print(" Stock:",ticker)
    print(" Observation date:",atdate)
    print(" Number of trading day(s) being predicted:",n_nextdays)
    print(" Max number of historical prices used:",n_samples)
    print(" Max number of features used in knn:",maxFN)
    print(" Max number of neighbors used in knn:",maxk)
    print(" Max number of obs used in weighted meadian:",top)
    print(" WARNING: It may take long time, please wait ......")
    #开始计时
    import time; time0 = time.perf_counter()

    print("\n... Capturing historical stock prices ......",end='')
    #设定起始日期:
    nyears=int((n_nextdays + n_samples + maxFN + 1)/252)+1
    start=date_adjust(atdate,-366*nyears)
    #抓取股价数据
    dfprice=get_stock_price(ticker,atdate,start)
    if dfprice is None:
        print("\n Error(forecast_stock_price_mp), failed to capture stock prices:",ticker)
        return None
    if len(dfprice) < (n_nextdays + n_samples + maxFN + 1):
        print("\n Error(forecast_stock_price_mp), insufficient number of stock prices!")
        return None
    print(", done!")
    print(" ",len(dfprice),"historical stock prices captured")

    print("... Start machine-learning using knn model in multiprocessing ......")
    #倒序随机数种子,便于尽快看到最优结果
    rslist=list(range(random_state,maxRS+1)); rslist.reverse()
    jobnum=len(rslist)

    #电脑CPU核心数
    import os; cores=os.cpu_count()
    print(" There are",cores,"core(s) inside the cpu of this computer")
    #确定进程池大小
    if cores <= 4: procnum=cores+1
    else: procnum=cores
    #确定多进程分组组数
    groupnum=int(jobnum / procnum); remain=jobnum % procnum
    if remain > 0: groupnum+=1
    group=list(range(groupnum))

    #建立数据集:记录各个进程输出结果
    import pandas as pd
    result=pd.DataFrame(columns=('random_state','pred_y','bestscore_train', \
        'bestscore_test','bestfeature','bestk'))
    #分组多任务
    import multiprocessing as mp
    for g in group:
        grpstart=g*procnum; grpend=(g+1)*procnum
        if grpend > jobnum: grpend=jobnum

        #创建进程池
        timep0 = time.perf_counter()
        pool=mp.Pool(processes=procnum)
        print("\n... Pool",g,"created with max capacity of",procnum,"processes in parallel")
        #建立多进程
        mptasks=[pool.apply_async(forecast_stock_price3,args=(dfprice,n_nextdays, \
            n_samples,maxk,maxFN,i,)) for i in list(range(grpstart,grpend))]
        pool.close()
        pool.join()

        #记录组内各个最佳结果
        for res in mptasks:
            row=res.get()
            result=result.append(row,ignore_index=True)
        print(" Completed processes for random_state",list(range(grpstart,grpend)))
        h0,wmp0=multisummary(result[grpstart:grpend+1],notes="Pool "+str(g),top=top)
        #计算组内运行时间
        timep1 = time.perf_counter(); elapsedp=timep1 - timep0
        print(" Elapsed time in Pool",g,"is",second2time(elapsedp))

    #排序最后结果
    result.sort_values(by=['bestscore_test'],ascending=True,inplace=True)

    #显示结果
    hsotest,wmprice=multisummary(result,'final',top)

    #计算总体运行时间
    time1 = time.perf_counter(); elapsed=time1 - time0
    print("\n... Total elapsed time is",second2time(elapsed))

    return hsotest,wmprice

if __name__ =="__main__":
    ticker='MSFT'
    atdate="2020-4-5"
    n_nextdays=1
    minRS=0
    maxRS=2
    predicted_prices=forecast_stock_price_mp(ticker,atdate,n_nextdays, \
        random_state=minRS,maxRS=maxRS)

#==============================================================================
#==============================================================================
#==============================================================================

def bestR1(X,y):
    """
    功能:给定特征矩阵和标签,使用岭回归,返回最优的alpha参数和模型
    最优策略:测试集分数最高,不管过拟合问题
    """

    import numpy as np
    #将整个样本随机分割为训练集和测试集
    from sklearn.model_selection import train_test_split
    X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)

    #初始化alpha,便于判断上行下行方向
    alphalist=[0.001,0.0011,0.00999,0.01,0.01001,0.999,1,1.01, \
        9.99,10,10.01,99,100,101,999,1000,1001,10000]

    from sklearn.linear_model import RidgeCV
    reg=RidgeCV(alphas=alphalist,cv=5,fit_intercept=True,normalize=True)

    reg.fit(X_train, y_train)
    score_train=reg.score(X_train, y_train)
    score_test=reg.score(X_test, y_test)
    alpha=reg.alpha_
    #print("%.5f, %.5f, %.5f"%(alpha,score_train,score_test))

    #确定alpha参数的优化范围
    if alpha in [0.001,0.01,1,2,10,100,1000,10000]:
        #print("%.5f, %.5f, %.5f"%(alpha,score_train,score_test))
        return reg,alpha,score_train,score_test

    if 0.001 < alpha < 0.01:
        alphalist1=np.arange(0.001,0.01,0.0005)
    if 0.01 < alpha < 1:
        alphalist1=np.arange(0.01,1,0.005)
    if 1 < alpha < 10:
        alphalist1=np.arange(1,10,0.01)
    if 10 < alpha < 100:
        alphalist1=np.arange(10,100,0.1)
    if 100 < alpha < 1000:
        alphalist1=np.arange(100,1000,1)
    if 1000 < alpha < 10000:
        alphalist1=np.arange(1000,10000,10)

    reg1=RidgeCV(alphas=alphalist1,cv=5,fit_intercept=True,normalize=True)
    reg1.fit(X_train, y_train)
    score1_train=reg1.score(X_train,y_train)
    score1_test =reg1.score(X_test, y_test)
    alpha1=reg1.alpha_

    #print("%.5f, %.5f, %.5f"%(alpha1,score1_train,score1_test))
    return reg1,alpha1,score1_train,score1_test


if __name__=='__main__':
    dfprice=get_stock_price('MSFT','4/3/2019','1/1/2015')
    X,y,ndprice=make_price_sample(dfprice,1,240,20)

    model,alpha,score_train,score_test=bestR1(X,y)
    print("%.5f, %.5f, %.5f"%(alpha,score_train,score_test))
    #结果:0.045,0.9277,0.8940

    X_new=ndprice[0,0:20]
    y_new=model.predict(X_new)
    print("%.2f"%y_new)
    #结果:119.43
#==============================================================================
def bestL1(X,y):
    """
    功能:给定特征矩阵和标签,使用拉索回归,返回最优的alpha参数和模型
    最优策略:测试集分数最高,不管过拟合问题
    """
    import numpy as np
    #将整个样本随机分割为训练集和测试集
    from sklearn.utils import column_or_1d
    y=column_or_1d(y,warn=False)
    from sklearn.model_selection import train_test_split
    X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)


    #初始alpha,便于判断上行下行方向
    alphalist=[0.001,0.0011,0.00999,0.01,0.01001,0.999,1,1.01,1.99,2,2.01, \
        9.99,10,10.01,99,100,101,999,1000,1001,10000]

    from sklearn.linear_model import LassoCV
    reg=LassoCV(alphas=alphalist,max_iter=10**6, \
        cv=5,fit_intercept=True,normalize=True)
    reg.fit(X_train, y_train)
    score_train=reg.score(X_train,y_train)
    score_test =reg.score(X_test, y_test)
    alpha=reg.alpha_
    #print("Step0: %.4f, %.5f, %.5f"%(alpha,score_train,score_test))

    #确定alpha参数的优化范围
    if alpha in [0.001,0.01,1,2,10,100,1000,10000]:
        #print("Step01: %.5f, %.5f, %.5f"%(alpha,score_train,score_test))
        return reg,alpha,score_train,score_test

    if 0.001 < alpha < 0.01:
        alphalist1=np.arange(0.0015,0.01,0.0005)

    if 0.01 < alpha < 1:
        alphalist1=np.arange(0.015,1,0.005)

    if 1 < alpha < 10:
        alphalist1=np.arange(1.01,10,0.01)

    if 10 < alpha < 100:
        alphalist1=np.arange(10.1,100,0.1)

    if 100 < alpha < 1000:
        alphalist1=np.arange(101,1000,1)

    if 1000 < alpha < 10000:
        alphalist1=np.arange(1010,10000,10)

    reg1=LassoCV(alphas=alphalist1,cv=5,fit_intercept=True,normalize=True)
    reg1.fit(X_train, y_train)
    score1_train=reg1.score(X_train,y_train)
    score1_test =reg1.score(X_test, y_test)
    alpha1=reg1.alpha_
    #print("Step1: %.4f, %.5f, %.5f"%(alpha1,score1_train,score1_test))
    return reg1,alpha1,score1_train,score1_test

if __name__=='__main__':
    dfprice=get_stock_price('MSFT','4/3/2019','1/1/2015')
    X,y,ndprice=make_price_sample(dfprice,1,240,20)

    model,alpha,score_train,score_test=bestL1(X,y)
    print("%.5f, %.5f, %.5f"%(alpha,score_train,score_test))
    #结果:0.015,0.9284,0.9043

    X_new=ndprice[0,0:20]
    y_new=model.predict(X_new)
    print("%.2f"%y_new)
    #结果:119.37

#==============================================================================
|
1302
|
-
|
1303
|
-
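#------------------------------------------------------------------------------
# Added sketch (not part of the original module): bestR1 and bestL1 both refine
# alpha with the same chain of if-statements. A hypothetical helper expressing
# that bracketing logic in one place; the bracket edges below mirror bestL1 and
# are assumptions for illustration, not library API.
import numpy as np

def refine_alpha_grid(alpha):
    """Return a finer alpha grid around the coarse CV winner, or None if no bracket matches."""
    brackets=[(0.001,0.01,0.0005),(0.01,1,0.005),(1,10,0.01),
              (10,100,0.1),(100,1000,1),(1000,10000,10)]
    for lo,hi,step in brackets:
        if lo < alpha < hi:
            return np.arange(lo+step,hi,step)
    return None

if __name__=='__main__':
    print(refine_alpha_grid(0.4)[:3])   # e.g. [0.015 0.02 0.025]
#------------------------------------------------------------------------------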
def bestEN2(X,y,maxalpha=2):
    """
    Purpose: given a feature matrix and labels, fit an elastic net regression and
    return the optimal alpha parameter together with the fitted model.
    Selection rule: rely on the built-in ElasticNetCV search; slow.
    """
    # Randomly split the whole sample into training and test sets
    from sklearn.utils import column_or_1d
    y=column_or_1d(y,warn=False)

    from sklearn.model_selection import train_test_split
    X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=66)

    # Restrict the parameter ranges
    import numpy as np
    alphalist=np.arange(0.01,maxalpha,0.01)
    l1list   =np.arange(0.01,1,0.01)

    from sklearn.linear_model import ElasticNetCV
    reg=ElasticNetCV(alphas=alphalist,l1_ratio=l1list)

    reg.fit(X_train, y_train)
    score_train=reg.score(X_train,y_train)
    score_test =reg.score(X_test, y_test)
    alpha=reg.alpha_
    l1ratio=reg.l1_ratio_

    return reg,alpha,l1ratio,score_train,score_test

if __name__=='__main__':
    dfprice=get_stock_price('MSFT','4/3/2019','1/1/2015')
    X,y,ndprice=make_price_sample(dfprice,1,240,20)

    model,alpha,l1ratio,score_train,score_test=bestEN2(X,y)
    print("%.5f, %.5f, %.5f, %.5f"%(alpha,l1ratio,score_train,score_test))
    # Result: 0.42, 0.99, 0.9258, 0.9174

    X_new=ndprice[0,0:20]
    y_new=model.predict(X_new)
    print("%.2f"%y_new)
    # Result: 119.60

#==============================================================================
def bestEN3(X,y):
    """
    Purpose: given a feature matrix and labels, fit an elastic net regression and
    return the optimal alpha parameter together with the fitted model.
    Selection rule: use cross-validation (cv); fast.
    Algorithm contributed by: Xu Lexin (Korean Language & International Business)
    """
    import numpy as np
    # Randomly split the whole sample into training and test sets
    from sklearn.utils import column_or_1d
    y=column_or_1d(y,warn=False)
    from sklearn.model_selection import train_test_split
    X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=66)

    from sklearn.linear_model import ElasticNetCV
    #reg=ElasticNetCV(cv=5, random_state=0)
    #reg.fit(X,y)

    l1list=np.arange(0.01,1,0.01)
    # Note: the normalize= argument was removed in scikit-learn 1.2
    ENet=ElasticNetCV(alphas=None, copy_X=True, cv=5, eps=0.001, \
                      fit_intercept=True, l1_ratio=l1list, max_iter=8000, \
                      n_alphas=100, n_jobs=None, normalize=True, \
                      positive=False, precompute='auto', random_state=0, \
                      selection='cyclic', tol=0.0001, verbose=0)
    ENet.fit(X_train, y_train)
    score_train=ENet.score(X_train, y_train)
    score_test=ENet.score(X_test, y_test)
    alpha=ENet.alpha_
    l1ratio=ENet.l1_ratio_
    #print("S1: %.5f, %.5f, %.5f, %.5f"%(alpha,l1ratio,score_train,score_test))

    return ENet,alpha,l1ratio,score_train,score_test

if __name__=='__main__':
    dfprice=get_stock_price('MSFT','4/3/2019','1/1/2015')
    X,y,ndprice=make_price_sample(dfprice,1,240,20)

    model,alpha,l1ratio,score_train,score_test=bestEN3(X,y)
    print("%.5f, %.5f, %.5f, %.5f"%(alpha,l1ratio,score_train,score_test))
    # Result: 0.005836, 0.99, 0.925, 0.9194

    X_new=ndprice[0,0:20]
    y_new=model.predict(X_new)
    print("%.2f"%y_new)
    # Result: 119.48
#==============================================================================
def bestEN1(X,y,maxalpha=2):
    """
    Purpose: given a feature matrix and labels, fit an elastic net regression and
    return the optimal alpha parameter together with the fitted model.
    Selection rule: brute-force enumeration of alpha and l1_ratio, keeping the
    highest test-set score; medium speed.
    Algorithm contributed by: Xu Lexin (Korean Language & International Business)
    """
    # Randomly split the whole sample into training and test sets
    from sklearn.utils import column_or_1d
    y=column_or_1d(y,warn=False)
    from sklearn.model_selection import train_test_split
    X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=66)

    # Initial threshold for the test-set score
    king_score=0.6
    from sklearn.linear_model import ElasticNet

    # Restrict the parameter ranges
    import numpy as np
    alphalist=np.arange(0.01,maxalpha,0.01)
    l1list   =np.arange(0.01,1,0.01)

    for i in alphalist:
        for j in l1list:
            reg=ElasticNet(alpha=i,l1_ratio=j)
            reg.fit(X_train,y_train)
            temp_score=reg.score(X_test,y_test)
            if temp_score > king_score:
                king_score=temp_score
                alpha=i
                l1ratio=j
                score_train=reg.score(X_train,y_train)
                score_test=temp_score
                model=reg

    return model,alpha,l1ratio,score_train,score_test

if __name__=='__main__':
    dfprice=get_stock_price('MSFT','4/3/2019','1/1/2015')
    X,y,ndprice=make_price_sample(dfprice,1,240,20)

    model,alpha,l1ratio,score_train,score_test=bestEN1(X,y)
    print("%.5f, %.5f, %.5f, %.5f"%(alpha,l1ratio,score_train,score_test))
    # Result: 1.31, 0.56, 0.9241, 0.9196

    X_new=ndprice[0,0:20]
    y_new=model.predict(X_new)
    print("%.2f"%y_new)
    # Result: 119.36

#==============================================================================
#==============================================================================
#==============================================================================
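#------------------------------------------------------------------------------
# Added sketch (not part of the original module): bestEN1's nested loops over
# alpha and l1_ratio are a hand-rolled grid search scored on the held-out test
# set. The standard equivalent is GridSearchCV, which instead scores each
# candidate by cross-validation on the training data. A minimal sketch on
# synthetic data with a deliberately coarser grid than bestEN1; all names and
# values below are illustrative assumptions only.
if __name__=='__main__':
    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.linear_model import ElasticNet

    Xd,yd=make_regression(n_samples=240,n_features=20,noise=10,random_state=66)
    X_tr,X_te,y_tr,y_te=train_test_split(Xd,yd,random_state=66)

    param_grid={'alpha':np.arange(0.1,2,0.1),'l1_ratio':np.arange(0.1,1,0.1)}
    gs=GridSearchCV(ElasticNet(max_iter=10000),param_grid,cv=5,n_jobs=-1)
    gs.fit(X_tr,y_tr)
    print(gs.best_params_,"test score: %.4f"%gs.score(X_te,y_te))
#------------------------------------------------------------------------------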
def get_portfolio(tickerlist,sharelist,atdate,fromdate):
    """
    Purpose: fetch the market value of a portfolio.
    Output: closing price series of the given portfolio, most recent date first.
    tickerlist: list of ticker symbols in the portfolio
    sharelist: list of weights of each stock in the portfolio
    atdate: current date; either today or a historical date, datetime type
    fromdate: sample start date; pick a date far enough back to obtain enough
              raw samples, same type as atdate
    """
    # Debug-only sample arguments; keep them commented out in production!
    #tickerlist=['BILI','PDD']
    #sharelist=[0.67,0.33]
    #sum(sharelist)
    #atdate='4/12/2019'
    #fromdate='1/1/2015'
    #---------------------------------------------

    # Check that the portfolio weights sum to 1
    if sum(sharelist) != 1.0:
        print(".Error: sum of all shares in the portfolio is not 1")
        return None

    # Fetch the stock prices
    price=get_prices_portfolio(tickerlist,sharelist,atdate,fromdate)

    # Drop samples earlier than the start date
    price2=price[price.index >= fromdate]

    # Sort by date in descending order so that recent prices come first
    sortedprice=price2.sort_index(axis=0,ascending=False)

    # Extract the date and the day of the week
    #sortedprice['Date']=sortedprice.index.date
    sortedprice['Date']=sortedprice.index.strftime("%Y-%m-%d")
    sortedprice['Weekday']=sortedprice.index.weekday+1

    # Combine the constituents into the portfolio value
    dfprice=sortedprice[['Date','Weekday','Close']]
    import copy
    dfprice2=copy.deepcopy(dfprice)
    dfprice2['Value']=0.0
    rownames=dfprice.columns.values.tolist()
    for i in range(2,len(rownames)):
        value=dfprice2[('Close',rownames[i][1])]*sharelist[i-2]
        dfprice2['Value']=dfprice2['Value']+value

    # Build the output
    import pandas as pd
    dfprice3=pd.DataFrame(columns=['Date','Weekday','Close'])
    dfprice3['Date']=dfprice2['Date']
    dfprice3['Weekday']=dfprice2['Weekday']
    dfprice3['Close']=dfprice2['Value']
    dfprice4=dfprice3.dropna()
    return dfprice4


if __name__=='__main__':
    dfprice=get_portfolio(['BILI','PDD'],[0.67,0.33],'4/12/2019','1/1/2015')
    dfprice.head(5)
    dfprice.tail(3)
    dfprice[dfprice.Date == '2019-03-29']
    dfprice[(dfprice.Date>='2019-03-20') & (dfprice.Date<='2019-03-29')]

    dfindex=get_price('^GSPC','4/12/2019','1/1/2015')

#=====================================================================
#=====================================================================
def tradefee(price,n_shares=1,trade='buy'):
    """
    Return the total transaction cost of buying or selling the given stock position.
    trade: buy or sell; the two are distinguished because their fees may differ.
    Note: stamp duty and broker commissions depend on the trade amount, while
    transfer fees depend on the number of shares traded.
    Rationale: countries, exchanges and brokers charge in different ways and at
    different rates. For teaching purposes a flat 0.2% is charged on buys and
    0.3% on sells; adapt as needed for real use.
    """
    if trade == 'buy':
        fee=price*n_shares*0.002
    elif trade == 'sell':
        fee=price*n_shares*0.003
    else:
        print("Invalid trade")
        fee=0.0   # previously left unbound here, which raised an error on return
    return fee

#=====================================================================
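#------------------------------------------------------------------------------
# Added example (not part of the original module): with the flat teaching rates
# above, a one-share round trip at assumed prices of 100 (buy) and 105 (sell)
# works out as follows.
if __name__=='__main__':
    buy_price, sell_price, n = 100.0, 105.0, 1
    buy_cost  = buy_price*n  + tradefee(buy_price, n, trade='buy')    # 100 + 0.20  = 100.20
    sell_gain = sell_price*n - tradefee(sell_price, n, trade='sell')  # 105 - 0.315 = 104.685
    print("net return: %.4f%%"%((sell_gain-buy_cost)/buy_cost*100))   # about 4.48%
#------------------------------------------------------------------------------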
def make_advice_sample(dfprice,dfindex,n_nextdays=10, \
                       n_samples=120,n_features=20, \
                       samplingtype='AR',n_shares=1):
    """
    Purpose: build the learning sample for the given stock.
    n_nextdays: how many days ahead to predict (default 10)
    n_samples: number of samples to generate (default 120)
    n_features: number of features to use (default 20, roughly one month of trading days)
    n_shares: number of shares traded (default 1)
    samplingtype: sample construction method, AR = historical abnormal (excess)
                  returns, JG = historical investment outcomes
    """
    # For testing only
    #n_shares=1
    #n_nextdays=5
    #n_samples=240
    #n_features=20
    #samplingtype='AR'

    # Extract closing prices and the market index as Series
    closeprice=dfprice.Close
    maxrec=len(closeprice)
    closeindex=dfindex.Close

    # Convert to the matrix types used by the ML code
    import numpy as np
    ndprice=np.asmatrix(closeprice,dtype=None)
    ndindex=np.asmatrix(closeindex,dtype=None)

    # Create the arrays and initialize them
    ndRs=np.asmatrix([[0.0 for i in range(maxrec)] for j in range(1)])
    ndRm=np.asmatrix([[0.0 for i in range(maxrec)] for j in range(1)])
    ndAR=np.asmatrix([[0.0 for i in range(maxrec)] for j in range(1)])
    ndJG=np.asmatrix([[-1 for i in range(maxrec)] for j in range(1)])

    for i in range(0,maxrec-n_nextdays-1):
        #print("i=",i)
        # Round-trip return on the stock: buy n_nextdays earlier, sell at index i
        # (prices are sorted most recent first)
        sellprice=ndprice[0,i]
        sellfee=tradefee(sellprice,n_shares,trade='sell')
        sellgain=sellprice*n_shares - sellfee
        buyprice=ndprice[0,i+n_nextdays]
        buyfee=tradefee(buyprice,n_shares,trade='buy')
        buycost=buyprice*n_shares + buyfee
        Rs=(sellgain-buycost)/buycost
        ndRs[0,i]=Rs

        # Market index return over the same period
        Rm=(ndindex[0,i]-ndindex[0,i+n_nextdays])/ndindex[0,i+n_nextdays]
        ndRm[0,i]=Rm
        AR=Rs - Rm
        ndAR[0,i]=AR

        # Evaluate the investment outcome
        if (Rs>0) & (AR>0): JG=1
        else: JG=-1
        ndJG[0,i]=JG

    # First label sample: label matrix y (shape: n_samples x 1)
    y=np.asmatrix(ndJG[0,0])
    # First feature sample: feature matrix X (shape: n_samples x n_features)
    # If the sampling type is AR, use ndAR as features; otherwise use ndJG
    if samplingtype=='AR': ndfeature=ndAR
    else: ndfeature=ndJG

    #row,col=ndfeature.shape
    X=ndfeature[0,(n_nextdays+1):(n_features+n_nextdays+1)]

    # Generate the remaining label and feature samples
    for i in range(1,n_samples):
        #print("i=",i)
        X_row=ndfeature[0,(n_nextdays+1+i):(n_features+n_nextdays+1+i)]
        m,n=X_row.shape
        if n == n_features:
            X=np.append(X,X_row,axis=0)
            y_row=np.asmatrix(ndJG[0,i])
            y=np.append(y,y_row,axis=0)
        else:
            print("\nInsufficient number of samples, try use smaller parms")
            print(" Number of samples:",maxrec)
            break  # exit the for loop; continue would only skip to the next iteration

    return X,y,ndfeature

if __name__=='__main__':
    ticker='MSFT'
    market='^GSPC'
    dfprice=get_price(ticker,'4/12/2019','1/1/2015')
    dfindex=get_price(market,'4/12/2019','1/1/2015')

    X,y,ndfeature=make_advice_sample(dfprice,dfindex,120,480,120,'AR')
    #m,n=ndfeature.shape
    X1,y1,ndfeature1=make_advice_sample(dfprice,dfindex,5,240,20,'JG')

    from sklearn.utils import column_or_1d
    y=column_or_1d(y,warn=False)
    from sklearn.model_selection import train_test_split
    X_train,X_test,y_train,y_test= \
        train_test_split(X,y,stratify=y,random_state=0)

    from sklearn.linear_model import LogisticRegression
    lr=LogisticRegression(C=1, penalty='l2',solver='liblinear')
    lr.fit(X_train,y_train)
    lr.score(X_train,y_train)
    lr.score(X_test,y_test)
    X_new=ndfeature[0,0:20]
    lr.predict(X_new)

    y1=column_or_1d(y1,warn=False)
    from sklearn.model_selection import train_test_split
    X1_train,X1_test,y1_train,y1_test= \
        train_test_split(X1,y1,stratify=y1,random_state=0)

    from sklearn.linear_model import LogisticRegression
    lr1=LogisticRegression(C=1, penalty='l2',solver='liblinear')
    lr1.fit(X1_train,y1_train)
    lr1.score(X1_train,y1_train)
    lr1.score(X1_test,y1_test)
    X1_new=ndfeature[0,0:20]
    lr1.predict(X1_new)

    ticker='PDD'
    dfprice=get_price(ticker,'4/12/2019','1/1/2015')
    len(dfprice)
    X,y,ndfeature=make_advice_sample(dfprice,dfindex,30,120,30,'AR')

    ticker='BILI'
    dfprice=get_price(ticker,'4/12/2019','1/1/2015')
    len(dfprice)
    X,y,ndfeature=make_advice_sample(dfprice,dfindex,20,120,40,'AR')
#=====================================================================
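#------------------------------------------------------------------------------
# Added sketch (not part of the original module): the core of make_advice_sample
# is the label rule -- sell at the more recent index i, buy n_nextdays earlier,
# compare the net stock return Rs with the index return Rm over the same period,
# and label the trade +1 only when Rs>0 and the abnormal return AR=Rs-Rm>0.
# A stand-alone illustration on made-up prices (the numbers are assumptions):
if __name__=='__main__':
    sell_price, buy_price  = 110.0, 100.0     # stock price today vs n days earlier
    index_now,  index_then = 2900.0, 2850.0   # market index on the same two dates
    sell_gain = sell_price - tradefee(sell_price, 1, 'sell')
    buy_cost  = buy_price  + tradefee(buy_price, 1, 'buy')
    Rs = (sell_gain-buy_cost)/buy_cost         # net stock return over the holding period
    Rm = (index_now-index_then)/index_then     # market return over the same period
    AR = Rs - Rm                               # abnormal (excess) return
    JG = 1 if (Rs>0 and AR>0) else -1          # label used by the classifiers below
    print("Rs=%.4f Rm=%.4f AR=%.4f label=%d"%(Rs,Rm,AR,JG))
#------------------------------------------------------------------------------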
def bestLRL1(X,y):
    """
    Purpose: search the regularization parameter C of an L1-penalized
    LogisticRegression for the highest test-set score.
    Returns: the best C and the fitted model.
    """
    # Randomly split the whole sample into training and test sets
    from sklearn.utils import column_or_1d
    y=column_or_1d(y,warn=False)
    from sklearn.model_selection import train_test_split
    X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y,random_state=0)

    best_score=0.1
    import numpy as np
    from sklearn.linear_model import LogisticRegression

    # Coarse-to-fine C grid, same candidates and order as the original four loops:
    # 0.01-1 (step 0.01), 1-10 (0.1), 10-100 (1), 100-1000 (10)
    Cgrid=np.concatenate([np.arange(0.01,1,0.01),np.arange(1,10,0.1),
                          np.arange(10,100,1),np.arange(100,1000,10)])
    for Cvalue in Cgrid:
        lr=LogisticRegression(C=Cvalue, penalty='l1', \
                              solver='liblinear', max_iter=10000)
        lr.fit(X_train,y_train)
        score_test = lr.score(X_test, y_test)
        if score_test > best_score:
            best_score=score_test
            best_C=Cvalue
            best_model=lr

    score_train=best_model.score(X_train,y_train)
    return best_model,best_C,score_train,best_score


if __name__=='__main__':
    ticker='MSFT'
    market='^GSPC'
    dfprice=get_price(ticker,'4/12/2019','1/1/2015')
    dfindex=get_price(market,'4/12/2019','1/1/2015')

    X,y,ndfeature=make_advice_sample(dfprice,dfindex,5,240,20,'AR')
    model,C,score_train,score_test=bestLRL1(X,y)
    print("%.5f, %.5f, %.5f"%(C,score_train,score_test))
    # Result: 14, 0.66667, 0.71667

    X_new=ndfeature[0,0:20]
    y_new=model.predict(X_new)
    print("%.0f"%y_new)
    # Result: -1

    X1,y1,ndfeature1=make_advice_sample(dfprice,dfindex,5,240,20,'JG')
    model,C,score_train,score_test=bestLRL1(X1,y1)
    print("%.5f, %.5f, %.5f"%(C,score_train,score_test))
    # Result: 0.14, 0.61667, 0.63333
#=====================================================================
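#------------------------------------------------------------------------------
# Added sketch (not part of the original module): the stepped C grid searched by
# bestLRL1 above can also be delegated to LogisticRegressionCV, which tunes C by
# cross-validation on the training data instead of against the test set. A
# minimal sketch on synthetic data; names and values are illustrative assumptions.
if __name__=='__main__':
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegressionCV

    Xd,yd=make_classification(n_samples=240,n_features=20,random_state=0)
    X_tr,X_te,y_tr,y_te=train_test_split(Xd,yd,stratify=yd,random_state=0)

    Cs=np.concatenate([np.arange(0.01,1,0.01),np.arange(1,10,0.1),
                       np.arange(10,100,1),np.arange(100,1000,10)])
    clf=LogisticRegressionCV(Cs=Cs,penalty='l1',solver='liblinear',cv=5,max_iter=10000)
    clf.fit(X_tr,y_tr)
    print("C=%.2f train=%.4f test=%.4f"%(clf.C_[0],clf.score(X_tr,y_tr),clf.score(X_te,y_te)))
#------------------------------------------------------------------------------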
def bestLRL2(X,y):
    """
    Purpose: search the regularization parameter C of an L2-penalized
    LogisticRegression for the highest test-set score.
    Returns: the best C and the fitted model.
    """
    # Randomly split the whole sample into training and test sets
    from sklearn.utils import column_or_1d
    y=column_or_1d(y,warn=False)
    from sklearn.model_selection import train_test_split
    X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y,random_state=0)

    best_score=0.1
    import numpy as np
    from sklearn.linear_model import LogisticRegression

    # Coarse-to-fine C grid, same candidates and order as the original four loops:
    # 0.01-1 (step 0.01), 1-10 (0.1), 10-100 (1), 100-1000 (10)
    Cgrid=np.concatenate([np.arange(0.01,1,0.01),np.arange(1,10,0.1),
                          np.arange(10,100,1),np.arange(100,1000,10)])
    for Cvalue in Cgrid:
        lr=LogisticRegression(C=Cvalue, penalty='l2', \
                              solver='liblinear', max_iter=10000)
        lr.fit(X_train,y_train)
        score_test = lr.score(X_test, y_test)
        if score_test > best_score:
            best_score=score_test
            best_C=Cvalue
            best_model=lr

    score_train=best_model.score(X_train,y_train)
    return best_model,best_C,score_train,best_score


if __name__=='__main__':
    ticker='MSFT'
    market='^GSPC'
    dfprice=get_price(ticker,'4/12/2019','1/1/2015')
    dfindex=get_price(market,'4/12/2019','1/1/2015')

    X,y,ndfeature=make_advice_sample(dfprice,dfindex,5,240,20,'AR')
    model,C,score_train,score_test=bestLRL2(X,y)
    print("%.5f, %.5f, %.5f"%(C,score_train,score_test))
    # Result: 33, 0.65, 0.68333

    X_new=ndfeature[0,0:20]
    y_new=model.predict(X_new)
    print("%.0f"%y_new)
    # Result: -1

    X1,y1,ndfeature1=make_advice_sample(dfprice,dfindex,5,240,20,'JG')
    model,C,score_train,score_test=bestLRL2(X1,y1)
    print("%.5f, %.5f, %.5f"%(C,score_train,score_test))
    # Result: 0.02, 0.62222, 0.66667

#==============================================================================
def bestLSVCL1(X,y):
    """
    Purpose: search the regularization parameter C of an L1-penalized LinearSVC
    for the highest test-set score.
    Returns: the best C and the fitted model.
    """
    # Randomly split the whole sample into training and test sets
    from sklearn.utils import column_or_1d
    y=column_or_1d(y,warn=False)
    from sklearn.model_selection import train_test_split
    X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y,random_state=0)

    best_score=0.1
    import numpy as np
    from sklearn.svm import LinearSVC

    # Coarse-to-fine C grid, same candidates and order as the original four loops:
    # 0.01-1 (step 0.01), 1-10 (0.1), 10-100 (1), 100-1000 (10)
    Cgrid=np.concatenate([np.arange(0.01,1,0.01),np.arange(1,10,0.1),
                          np.arange(10,100,1),np.arange(100,1000,10)])
    for Cvalue in Cgrid:
        lr=LinearSVC(C=Cvalue, penalty='l1',dual=False,max_iter=10**6)
        lr.fit(X_train,y_train)
        score_test = lr.score(X_test, y_test)
        if score_test > best_score:
            best_score=score_test
            best_C=Cvalue
            best_model=lr

    score_train=best_model.score(X_train,y_train)
    return best_model,best_C,score_train,best_score


if __name__=='__main__':
    ticker='MSFT'
    market='^GSPC'
    dfprice=get_price(ticker,'4/12/2019','1/1/2015')
    dfindex=get_price(market,'4/12/2019','1/1/2015')

    X,y,ndfeature=make_advice_sample(dfprice,dfindex,5,240,20,'AR')
    model,C,score_train,score_test=bestLSVCL1(X,y)
    print("%.5f, %.5f, %.5f"%(C,score_train,score_test))
    # Result: 5.3, 0.66111, 0.71667

    X_new=ndfeature[0,0:20]
    y_new=model.predict(X_new)
    print("%.0f"%y_new)
    # Result: -1

    X1,y1,ndfeature1=make_advice_sample(dfprice,dfindex,5,240,20,'JG')
    model,C,score_train,score_test=bestLSVCL1(X1,y1)
    print("%.5f, %.5f, %.5f"%(C,score_train,score_test))
    # Result: 0.04, 0.61667, 0.63333

#==============================================================================
def bestLSVCL2(X,y):
    """
    Purpose: search the regularization parameter C of an L2-penalized LinearSVC
    for the highest test-set score.
    Returns: the best C and the fitted model.
    """
    # Randomly split the whole sample into training and test sets
    from sklearn.utils import column_or_1d
    y=column_or_1d(y,warn=False)
    from sklearn.model_selection import train_test_split
    X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y,random_state=0)

    best_score=0.1
    import numpy as np
    from sklearn.svm import LinearSVC

    # Coarse-to-fine C grid, same candidates and order as the original four loops:
    # 0.01-1 (step 0.01), 1-10 (0.1), 10-100 (1), 100-1000 (10)
    Cgrid=np.concatenate([np.arange(0.01,1,0.01),np.arange(1,10,0.1),
                          np.arange(10,100,1),np.arange(100,1000,10)])
    for Cvalue in Cgrid:
        lr=LinearSVC(C=Cvalue, penalty='l2',dual=False,max_iter=10**6)
        lr.fit(X_train,y_train)
        score_test = lr.score(X_test, y_test)
        if score_test > best_score:
            best_score=score_test
            best_C=Cvalue
            best_model=lr

    score_train=best_model.score(X_train,y_train)
    return best_model,best_C,score_train,best_score


if __name__=='__main__':
    ticker='MSFT'
    market='^GSPC'
    dfprice=get_price(ticker,'4/12/2019','1/1/2015')
    dfindex=get_price(market,'4/12/2019','1/1/2015')

    X,y,ndfeature=make_advice_sample(dfprice,dfindex,5,240,20,'AR')
    model,C,score_train,score_test=bestLSVCL2(X,y)
    print("%.5f, %.5f, %.5f"%(C,score_train,score_test))
    # Result: 4, 0.65000, 0.68333

    X_new=ndfeature[0,0:20]
    y_new=model.predict(X_new)
    print("%.0f"%y_new)
    # Result: -1

    X1,y1,ndfeature1=make_advice_sample(dfprice,dfindex,5,240,20,'JG')
    model,C,score_train,score_test=bestLSVCL2(X1,y1)
    print("%.5f, %.5f, %.5f"%(C,score_train,score_test))
    # Result: 0.01, 0.63333, 0.58333

#==============================================================================
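#------------------------------------------------------------------------------
# Added sketch (not part of the original module): the same C candidates can also
# drive a generic GridSearchCV, which works unchanged for LinearSVC with either
# penalty. Synthetic data again; names and values are illustrative assumptions.
if __name__=='__main__':
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.svm import LinearSVC

    Xd,yd=make_classification(n_samples=240,n_features=20,random_state=0)
    X_tr,X_te,y_tr,y_te=train_test_split(Xd,yd,stratify=yd,random_state=0)

    Cs=np.concatenate([np.arange(0.01,1,0.01),np.arange(1,10,0.1),
                       np.arange(10,100,1),np.arange(100,1000,10)])
    gs=GridSearchCV(LinearSVC(penalty='l1',dual=False,max_iter=10**6),
                    {'C':Cs},cv=5,n_jobs=-1)
    gs.fit(X_tr,y_tr)
    print("C=%.2f train=%.4f test=%.4f"%(gs.best_params_['C'],
                                         gs.score(X_tr,y_tr),gs.score(X_te,y_te)))
#------------------------------------------------------------------------------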
def bestMODEL(dfprice,dfindex,n_nextdays=5, \
              n_samples=240,n_features=20, n_shares=1):
    """
    Purpose: given the holding period and the sample-construction parameters,
    find the best C value and predict the investment outcome.
    """
    best_score=0.1

    # Build the samples for both construction methods: AR and JG
    X,y,ndfeature=make_advice_sample(dfprice,dfindex,n_nextdays, \
                                     n_samples,n_features,'AR',n_shares)
    X1,y1,ndfeature1=make_advice_sample(dfprice,dfindex,n_nextdays, \
                                        n_samples,n_features,'JG',n_shares)

    # Try each optimization strategy on each sampling type, in the same order as
    # the original hand-written blocks, and keep the highest test-set score
    strategies=[('LRL1',bestLRL1),('LRL2',bestLRL2), \
                ('LSVCL1',bestLSVCL1),('LSVCL2',bestLSVCL2)]
    for strategy_name,strategy_func in strategies:
        for type_name,XX,yy in [('AR',X,y),('JG',X1,y1)]:
            model,C,score_train,score_test=strategy_func(XX,yy)
            if score_test > best_score:
                bestmodel=model
                bestC=C
                best_train=score_train
                best_score=score_test
                besttype=type_name
                beststrategy=strategy_name

    print("  ***Model settings***")
    print("    Future days       :",n_nextdays)
    print("    Number of samples :",n_samples)
    print("    Number of features:",n_features)
    print("  ***Best model specification***")
    print("    Model             :",beststrategy)
    print("    Sampling type     :",besttype)
    print("    C value           : %.2f"%bestC)
    print("    Score on train    : %.4f"%best_train)
    print("    Score on test     : %.4f"%best_score)

    ndf=ndfeature
    if besttype == 'JG': ndf=ndfeature1

    # Return best_train rather than the score_train of the last strategy tried
    return bestmodel,beststrategy,bestC,besttype,best_train,best_score,ndf


if __name__=='__main__':
    ticker='MSFT'
    market='^GSPC'
    dfprice=get_price(ticker,'4/12/2019','1/1/2015')
    dfindex=get_price(market,'4/12/2019','1/1/2015')

    n_days=5
    n_samples=240
    n_features=20
    model,strategy,C,ntype,score_train,score_test,ndfeature= \
        bestMODEL(dfprice,dfindex,n_days,n_samples,n_features)
    # Result: LRL1, AR, 14.00, 0.6333, 0.7167

    X_new=ndfeature[0,0:n_features]
    y_new=model.predict(X_new)
    print(y_new[0])
    # Result: -1

    ticker='BILI'
    dfprice=get_price(ticker,'4/12/2019','1/1/2015')

    n_days=20
    n_samples=120
    n_features=40
    model,strategy,C,ntype,score_train,score_test,ndfeature= \
        bestMODEL(dfprice,dfindex,n_days,n_samples,n_features)
    """
    Result:
    ***Model settings***
      Stock             : BILI
      Future days       : 20
      Number of samples : 120
      Number of features: 40
    ***Best model specification***
      Model             : LRL1
      Sampling type     : AR
      C value           : 2.00
      Score on train    : 0.7111
      Score on test     : 0.7000
    """

    X_new=ndfeature[0,0:n_features]
    y_new=model.predict(X_new)
    print(y_new[0])
    # Result: 1

    ticker='PDD'
    dfprice=get_price(ticker,'4/12/2019','1/1/2015')

    n_days=30
    n_samples=120
    n_features=30
    model,strategy,C,ntype,score_train,score_test,ndfeature= \
        bestMODEL(dfprice,dfindex,n_days,n_samples,n_features)
    """
    Result:
    ***Model settings***
      Stock             : PDD
      Future days       : 30
      Number of samples : 120
      Number of features: 30
    ***Best model specification***
      Model             : LRL2
      Sampling type     : AR
      C value           : 0.21
      Score on train    : 0.7667
      Score on test     : 0.7333
    """

    X_new=ndfeature[0,0:n_features]
    y_new=model.predict(X_new)
    print(y_new[0])
    # Result: 1

    ticker='BABA'
    dfprice=get_price(ticker,'4/12/2019','1/1/2015')

    n_days=20
    n_samples=120
    n_features=40
    model,strategy,C,ntype,score_train,score_test,ndfeature= \
        bestMODEL(dfprice,dfindex,n_days,n_samples,n_features)
    """
    Result:
    ***Model settings***
      Future days       : 20
      Number of samples : 120
      Number of features: 40
    ***Best model specification***
      Model             : LSVCL1
      Sampling type     : JG
      C value           : 0.26
      Score on train    : 0.8111
      Score on test     : 0.8000
    """

    X_new=ndfeature[0,0:n_features]
    y_new=model.predict(X_new)
    print(y_new[0])
    # Result: 1

    dfprice=get_portfolio(['BABA','BILI','PDD'],[0.5,0.33,0.17],'4/12/2019','1/1/2015')
    model,strategy,C,ntype,score_train,score_test,ndfeature= \
        bestMODEL(dfprice,dfindex,30,80,60)
    """
    Result:
    ***Model settings***
      Future days       : 30
      Number of samples : 80
      Number of features: 60
    ***Best model specification***
      Model             : LRL1
      Sampling type     : AR
      C value           : 1.30
      Score on train    : 0.9333
      Score on test     : 0.9500
    """
    X_new=ndfeature[0,0:60]
    y_new=model.predict(X_new)
    print(y_new[0])
    # Result: 1

#==============================================================================
def bestMODEL2(dfprice,dfindex,n_nextdays=10,n_shares=1):
    """
    Purpose: given the holding period, search for the best sample-construction
    parameters and the best C value, and predict the investment outcome.
    """
    print("\nLooking for best numbers of samples and features, please wait...")

    best_score=0.1
    import numpy as np

    for f in np.arange(20,60,10):
        for s in np.arange(120,240,120):   # note: this range contains only the value 120
            # bestMODEL returns seven values and takes n_nextdays here
            # (the original unpacked six values and referenced an undefined n_days)
            model,strategy,C,ntype,score_train,score_test,ndf= \
                bestMODEL(dfprice,dfindex,n_nextdays,s,f)
            if score_test > best_score:
                bestmodel=model
                bestsamples=s
                bestfeatures=f
                beststrategy=strategy
                bestC=C
                besttype=ntype
                best_train=score_train
                best_score=score_test

    print("  ***Model settings")
    print("    Future days       :",n_nextdays)
    print("  ***Best model specification")
    print("    Model             :",beststrategy)
    print("    Sampling type     :",besttype)
    print("    Number of samples :",bestsamples)
    print("    Number of features:",bestfeatures)
    print("    C value           :%.2f"%bestC)
    print("    Score on train    :%.4f"%best_train)
    print("    Score on test     :%.4f"%best_score)

    # Return best_train rather than the score_train of the last combination tried
    return bestmodel,bestsamples,bestfeatures,beststrategy,bestC,besttype, \
           best_train,best_score


if __name__=='__main__':
    ticker='MSFT'
    market='^GSPC'
    dfprice=get_price(ticker,'4/12/2019','1/1/2015')
    dfindex=get_price(market,'4/12/2019','1/1/2015')

    n_days=5
    model,samples,features,strategy,C,ntype,score_train,best_score= \
        bestMODEL2(dfprice,dfindex,n_days)
    # Result: 120, 30, LRL1, AR, 18.00, 0.7079, 0.8

    X,y,ndfeature=make_advice_sample(dfprice,dfindex,n_days, \
                                     samples,features,ntype)
    X_new=ndfeature[0,0:features]
    y_new=model.predict(X_new)
    print(y_new[0])
    # Result: -1

#==============================================================================
#==============================================================================
#==============================================================================

#==============================================================================
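#------------------------------------------------------------------------------
# Added sketch (not part of the original module): bestMODEL2 sweeps the number
# of features and samples with nested numpy ranges. The same sweep written with
# itertools.product over explicit candidate lists, which is easier to read and
# extend; the candidate values and the placeholder score are assumptions used
# only so this sketch runs without market data.
if __name__=='__main__':
    from itertools import product
    best=(None,-1.0)
    for f,s in product([20,30,40,50],[120,240]):
        # In the module this would be: score = bestMODEL(dfprice,dfindex,n_nextdays,s,f)[5]
        score=0.0
        if score > best[1]:
            best=((s,f),score)
    print("best (n_samples, n_features):",best[0])
#------------------------------------------------------------------------------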