siat 3.10.132__py3-none-any.whl → 3.10.133__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- siat/__init__.py +0 -0
- siat/allin.py +0 -0
- siat/assets_liquidity.py +0 -0
- siat/beta_adjustment.py +0 -0
- siat/beta_adjustment_china.py +0 -0
- siat/blockchain.py +0 -0
- siat/bond.py +0 -0
- siat/bond_base.py +0 -0
- siat/bond_china.py +0 -0
- siat/bond_zh_sina.py +0 -0
- siat/capm_beta.py +0 -0
- siat/capm_beta2.py +0 -0
- siat/compare_cross.py +0 -0
- siat/copyrights.py +0 -0
- siat/cryptocurrency.py +0 -0
- siat/economy.py +0 -0
- siat/economy2.py +0 -0
- siat/esg.py +0 -0
- siat/event_study.py +0 -0
- siat/exchange_bond_china.pickle +0 -0
- siat/fama_french.py +0 -0
- siat/fin_stmt2_yahoo.py +0 -0
- siat/financial_base.py +0 -0
- siat/financial_statements.py +0 -0
- siat/financials.py +0 -0
- siat/financials2.py +0 -0
- siat/financials_china.py +0 -0
- siat/financials_china2.py +0 -0
- siat/fund.py +0 -0
- siat/fund_china.pickle +0 -0
- siat/fund_china.py +0 -0
- siat/future_china.py +0 -0
- siat/google_authenticator.py +0 -0
- siat/grafix.py +0 -0
- siat/holding_risk.py +0 -0
- siat/luchy_draw.py +0 -0
- siat/market_china.py +0 -0
- siat/markowitz.py +0 -0
- siat/markowitz2.py +0 -0
- siat/markowitz2_20250704.py +0 -0
- siat/markowitz2_20250705.py +0 -0
- siat/markowitz_simple.py +0 -0
- siat/ml_cases.py +0 -0
- siat/ml_cases_example.py +0 -0
- siat/option_china.py +0 -0
- siat/option_pricing.py +0 -0
- siat/other_indexes.py +0 -0
- siat/risk_adjusted_return.py +0 -0
- siat/risk_adjusted_return2.py +0 -0
- siat/risk_evaluation.py +0 -0
- siat/risk_free_rate.py +0 -0
- siat/sector_china.py +0 -0
- siat/security_price2.py +0 -0
- siat/security_prices.py +40 -2
- siat/security_trend.py +0 -0
- siat/security_trend2.py +0 -0
- siat/stock.py +0 -0
- siat/stock_advice_linear.py +0 -0
- siat/stock_base.py +0 -0
- siat/stock_china.py +0 -0
- siat/stock_info.pickle +0 -0
- siat/stock_prices_kneighbors.py +0 -0
- siat/stock_prices_linear.py +0 -0
- siat/stock_profile.py +0 -0
- siat/stock_technical.py +0 -0
- siat/stooq.py +0 -0
- siat/transaction.py +0 -0
- siat/translate.py +0 -0
- siat/valuation.py +0 -0
- siat/valuation_china.py +0 -0
- siat/var_model_validation.py +0 -0
- siat/yf_name.py +0 -0
- {siat-3.10.132.dist-info/licenses → siat-3.10.133.dist-info}/LICENSE +0 -0
- {siat-3.10.132.dist-info → siat-3.10.133.dist-info}/METADATA +232 -235
- siat-3.10.133.dist-info/RECORD +78 -0
- {siat-3.10.132.dist-info → siat-3.10.133.dist-info}/WHEEL +1 -1
- {siat-3.10.132.dist-info → siat-3.10.133.dist-info}/top_level.txt +0 -1
- build/lib/build/lib/siat/__init__.py +0 -75
- build/lib/build/lib/siat/allin.py +0 -137
- build/lib/build/lib/siat/assets_liquidity.py +0 -915
- build/lib/build/lib/siat/beta_adjustment.py +0 -1058
- build/lib/build/lib/siat/beta_adjustment_china.py +0 -548
- build/lib/build/lib/siat/blockchain.py +0 -143
- build/lib/build/lib/siat/bond.py +0 -2900
- build/lib/build/lib/siat/bond_base.py +0 -992
- build/lib/build/lib/siat/bond_china.py +0 -100
- build/lib/build/lib/siat/bond_zh_sina.py +0 -143
- build/lib/build/lib/siat/capm_beta.py +0 -783
- build/lib/build/lib/siat/capm_beta2.py +0 -887
- build/lib/build/lib/siat/common.py +0 -5360
- build/lib/build/lib/siat/compare_cross.py +0 -642
- build/lib/build/lib/siat/copyrights.py +0 -18
- build/lib/build/lib/siat/cryptocurrency.py +0 -667
- build/lib/build/lib/siat/economy.py +0 -1471
- build/lib/build/lib/siat/economy2.py +0 -1853
- build/lib/build/lib/siat/esg.py +0 -536
- build/lib/build/lib/siat/event_study.py +0 -815
- build/lib/build/lib/siat/fama_french.py +0 -1521
- build/lib/build/lib/siat/fin_stmt2_yahoo.py +0 -982
- build/lib/build/lib/siat/financial_base.py +0 -1160
- build/lib/build/lib/siat/financial_statements.py +0 -598
- build/lib/build/lib/siat/financials.py +0 -2339
- build/lib/build/lib/siat/financials2.py +0 -1278
- build/lib/build/lib/siat/financials_china.py +0 -4433
- build/lib/build/lib/siat/financials_china2.py +0 -2212
- build/lib/build/lib/siat/fund.py +0 -629
- build/lib/build/lib/siat/fund_china.py +0 -3307
- build/lib/build/lib/siat/future_china.py +0 -551
- build/lib/build/lib/siat/google_authenticator.py +0 -47
- build/lib/build/lib/siat/grafix.py +0 -3636
- build/lib/build/lib/siat/holding_risk.py +0 -867
- build/lib/build/lib/siat/luchy_draw.py +0 -638
- build/lib/build/lib/siat/market_china.py +0 -1168
- build/lib/build/lib/siat/markowitz.py +0 -2363
- build/lib/build/lib/siat/markowitz2.py +0 -3150
- build/lib/build/lib/siat/markowitz2_20250704.py +0 -2969
- build/lib/build/lib/siat/markowitz2_20250705.py +0 -3158
- build/lib/build/lib/siat/markowitz_simple.py +0 -373
- build/lib/build/lib/siat/ml_cases.py +0 -2291
- build/lib/build/lib/siat/ml_cases_example.py +0 -60
- build/lib/build/lib/siat/option_china.py +0 -3069
- build/lib/build/lib/siat/option_pricing.py +0 -1925
- build/lib/build/lib/siat/other_indexes.py +0 -409
- build/lib/build/lib/siat/risk_adjusted_return.py +0 -1576
- build/lib/build/lib/siat/risk_adjusted_return2.py +0 -1900
- build/lib/build/lib/siat/risk_evaluation.py +0 -2218
- build/lib/build/lib/siat/risk_free_rate.py +0 -351
- build/lib/build/lib/siat/sector_china.py +0 -4140
- build/lib/build/lib/siat/security_price2.py +0 -727
- build/lib/build/lib/siat/security_prices.py +0 -3408
- build/lib/build/lib/siat/security_trend.py +0 -402
- build/lib/build/lib/siat/security_trend2.py +0 -646
- build/lib/build/lib/siat/stock.py +0 -4284
- build/lib/build/lib/siat/stock_advice_linear.py +0 -934
- build/lib/build/lib/siat/stock_base.py +0 -26
- build/lib/build/lib/siat/stock_china.py +0 -2095
- build/lib/build/lib/siat/stock_prices_kneighbors.py +0 -910
- build/lib/build/lib/siat/stock_prices_linear.py +0 -386
- build/lib/build/lib/siat/stock_profile.py +0 -707
- build/lib/build/lib/siat/stock_technical.py +0 -3305
- build/lib/build/lib/siat/stooq.py +0 -74
- build/lib/build/lib/siat/transaction.py +0 -347
- build/lib/build/lib/siat/translate.py +0 -5183
- build/lib/build/lib/siat/valuation.py +0 -1378
- build/lib/build/lib/siat/valuation_china.py +0 -2076
- build/lib/build/lib/siat/var_model_validation.py +0 -444
- build/lib/build/lib/siat/yf_name.py +0 -811
- build/lib/siat/__init__.py +0 -75
- build/lib/siat/allin.py +0 -137
- build/lib/siat/assets_liquidity.py +0 -915
- build/lib/siat/beta_adjustment.py +0 -1058
- build/lib/siat/beta_adjustment_china.py +0 -548
- build/lib/siat/blockchain.py +0 -143
- build/lib/siat/bond.py +0 -2900
- build/lib/siat/bond_base.py +0 -992
- build/lib/siat/bond_china.py +0 -100
- build/lib/siat/bond_zh_sina.py +0 -143
- build/lib/siat/capm_beta.py +0 -783
- build/lib/siat/capm_beta2.py +0 -887
- build/lib/siat/common.py +0 -5360
- build/lib/siat/compare_cross.py +0 -642
- build/lib/siat/copyrights.py +0 -18
- build/lib/siat/cryptocurrency.py +0 -667
- build/lib/siat/economy.py +0 -1471
- build/lib/siat/economy2.py +0 -1853
- build/lib/siat/esg.py +0 -536
- build/lib/siat/event_study.py +0 -815
- build/lib/siat/fama_french.py +0 -1521
- build/lib/siat/fin_stmt2_yahoo.py +0 -982
- build/lib/siat/financial_base.py +0 -1160
- build/lib/siat/financial_statements.py +0 -598
- build/lib/siat/financials.py +0 -2339
- build/lib/siat/financials2.py +0 -1278
- build/lib/siat/financials_china.py +0 -4433
- build/lib/siat/financials_china2.py +0 -2212
- build/lib/siat/fund.py +0 -629
- build/lib/siat/fund_china.py +0 -3307
- build/lib/siat/future_china.py +0 -551
- build/lib/siat/google_authenticator.py +0 -47
- build/lib/siat/grafix.py +0 -3636
- build/lib/siat/holding_risk.py +0 -867
- build/lib/siat/luchy_draw.py +0 -638
- build/lib/siat/market_china.py +0 -1168
- build/lib/siat/markowitz.py +0 -2363
- build/lib/siat/markowitz2.py +0 -3150
- build/lib/siat/markowitz2_20250704.py +0 -2969
- build/lib/siat/markowitz2_20250705.py +0 -3158
- build/lib/siat/markowitz_simple.py +0 -373
- build/lib/siat/ml_cases.py +0 -2291
- build/lib/siat/ml_cases_example.py +0 -60
- build/lib/siat/option_china.py +0 -3069
- build/lib/siat/option_pricing.py +0 -1925
- build/lib/siat/other_indexes.py +0 -409
- build/lib/siat/risk_adjusted_return.py +0 -1576
- build/lib/siat/risk_adjusted_return2.py +0 -1900
- build/lib/siat/risk_evaluation.py +0 -2218
- build/lib/siat/risk_free_rate.py +0 -351
- build/lib/siat/sector_china.py +0 -4140
- build/lib/siat/security_price2.py +0 -727
- build/lib/siat/security_prices.py +0 -3408
- build/lib/siat/security_trend.py +0 -402
- build/lib/siat/security_trend2.py +0 -646
- build/lib/siat/stock.py +0 -4284
- build/lib/siat/stock_advice_linear.py +0 -934
- build/lib/siat/stock_base.py +0 -26
- build/lib/siat/stock_china.py +0 -2095
- build/lib/siat/stock_prices_kneighbors.py +0 -910
- build/lib/siat/stock_prices_linear.py +0 -386
- build/lib/siat/stock_profile.py +0 -707
- build/lib/siat/stock_technical.py +0 -3305
- build/lib/siat/stooq.py +0 -74
- build/lib/siat/transaction.py +0 -347
- build/lib/siat/translate.py +0 -5183
- build/lib/siat/valuation.py +0 -1378
- build/lib/siat/valuation_china.py +0 -2076
- build/lib/siat/var_model_validation.py +0 -444
- build/lib/siat/yf_name.py +0 -811
- siat-3.10.132.dist-info/RECORD +0 -218
@@ -1,910 +0,0 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
"""
|
3
|
-
@function: 预测美股股价,教学演示用,其他用途责任自负,仅适用于机器学习课堂案例演示
|
4
|
-
@version:v1.4,2020.4.6
|
5
|
-
@purpose: 机器学习课程案例
|
6
|
-
@author: 王德宏,北京外国语大学国际商学院
|
7
|
-
特点:加入了多进程并行处理mp,在多核电脑上可以明显提速
|
8
|
-
"""
|
9
|
-
|
10
|
-
#==============================================================================
|
11
|
-
import warnings; warnings.filterwarnings('ignore')
|
12
|
-
#==============================================================================
|
13
|
-
def get_stock_price(ticker,atdate,fromdate):
    """
    Download closing prices for one ticker and return them newest-first.

    Parameters:
        ticker: security code handed to pandas_datareader.
        atdate: last date of the sample (today or a historical date).
        fromdate: first date of the sample; choose it early enough to
            leave plenty of raw observations.

    Returns:
        A DataFrame with the columns 'Date' (YYYY-MM-DD string),
        'Weekday' (index weekday + 1, so Monday is 1) and 'Close',
        sorted by date in descending order. None is returned, after
        printing a message, when pandas_datareader is missing or the
        download fails.
    """
    # pandas_datareader is an optional dependency: report its absence only
    try:
        from pandas_datareader import data
    except ImportError:
        print(".Error(get_stock_price), pls install pandas_datareader first!")
        return None

    # Any download/parsing failure is reported; Ctrl-C is no longer swallowed
    try:
        price=data.DataReader(ticker,'yahoo',fromdate,atdate)
    except Exception:
        print(".Error(get_stock_price), failed to capture stock prices:",ticker,fromdate,atdate)
        return None

    # Most recent prices first
    sortedprice=price.sort_index(axis=0,ascending=False)

    # Derive the date string and the weekday number from the index
    sortedprice['Date']=sortedprice.index.strftime("%Y-%m-%d")
    sortedprice['Weekday']=sortedprice.index.weekday+1

    # Output layout: date, weekday, closing price
    return sortedprice[['Date','Weekday','Close']]
|
46
|
-
|
47
|
-
|
48
|
-
if __name__=='__main__':
    # Demo: fetch MSFT prices and inspect a few slices of the result
    ticker,atdate,fromdate='MSFT','4/2/2020','1/1/2015'
    dfprice=get_stock_price('MSFT','4/2/2020','1/1/2015')
    dfprice.head(5)
    dfprice.tail(3)
    dfprice[dfprice.Date == '2019-03-29']
    # Rows inside a closed date interval
    dfprice[(dfprice.Date>='2019-03-20') & (dfprice.Date<='2019-03-29')]
|
57
|
-
|
58
|
-
|
59
|
-
#==============================================================================
|
60
|
-
def make_price_sample(dfprice,n_nextdays=1,n_samples=252,n_features=21):
    """
    Build machine-learning samples from a newest-first price table.

    Sample i uses the price at position i as its label and the
    n_features prices starting n_nextdays positions later (i.e. older
    prices) as its features.

    Parameters:
        dfprice: table with a 'Close' column, most recent price first.
        n_nextdays: how many rows separate a label from its first feature.
        n_samples: number of samples to generate (252 by default).
        n_features: number of prices per feature row (21 by default).

    Returns:
        (X, y, ndprice) as numpy matrices: X is n_samples x n_features,
        y is n_samples x 1 and ndprice is the 1 x len(dfprice) row of
        closing prices. (None, None, None) is returned, after printing a
        message, when dfprice holds fewer than
        n_features + n_nextdays + n_samples rows.
    """
    import numpy as np

    # Make sure there are enough observations
    n_req=n_features+n_nextdays+n_samples
    if len(dfprice) < n_req:
        print(".Error(make_price_sample), need more number of stock prices!")
        print("...There are only",len(dfprice),"obs in the stock price file")
        print("...But, I need at least",n_req,"obs to make ML samples")
        return None,None,None

    # Closing prices as a 1 x N matrix (returned as-is) and as a flat array
    ndprice=np.asmatrix(dfprice.Close,dtype=None)
    prices=np.asarray(ndprice)[0]

    # The original loop always produced at least one sample
    n_rows=max(n_samples,1)

    # Labels: the first n_rows prices as a column
    y=np.asmatrix(prices[:n_rows]).T

    # Features: one window per sample, stacked once instead of appended
    # row by row (repeated np.append copies the whole matrix every time)
    windows=[prices[n_nextdays+i:n_features+n_nextdays+i] for i in range(n_rows)]
    X=np.asmatrix(np.vstack(windows))

    return X,y,ndprice
|
98
|
-
|
99
|
-
if __name__=='__main__':
    # Demo: build 200 samples with 21 features each and look at a few slices
    dfprice=get_stock_price('LK','4/3/2020','1/1/2015')
    X,y,ndprice=make_price_sample(dfprice,1,200,21)
    y[:5]
    y[2:5]  # row numbering starts at 0
    X[:5]
    X[:-5]
    # Third row, second column
    X[2,1]
|
107
|
-
|
108
|
-
|
109
|
-
#==============================================================================
|
110
|
-
def bestKN(X,y,maxk=10,random_state=0):
    """
    Pick the neighbour count whose k-nearest-neighbour regressor scores
    highest on the test split.

    Strategy: maximise the test-set score; over-fitting is not considered.

    Parameters:
        X: feature matrix.
        y: label matrix.
        maxk: largest neighbour count to try (k runs from 1 to maxk).
        random_state: seed passed to train_test_split.

    Returns:
        (bestmodel, bestk, bestscore_train, bestscore_test)
    """
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsRegressor

    # Recent scikit-learn releases refuse numpy.matrix inputs, which is
    # what make_price_sample produces; plain ndarrays keep the same shape
    if isinstance(X,np.matrix):
        X=np.asarray(X)
    if isinstance(y,np.matrix):
        y=np.asarray(y)

    # Random split into training and test sets
    X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=random_state)

    bestmodel=None
    bestk=1
    bestscore_train=None
    bestscore_test=None

    # k=1 is always tried and is the baseline; larger k must beat it strictly
    for k in [1]+list(range(2,maxk+1)):
        reg=KNeighborsRegressor(n_neighbors=k,weights='distance',n_jobs=-1)
        reg.fit(X_train,y_train)
        score_train=reg.score(X_train,y_train)
        score_test=reg.score(X_test,y_test)

        if bestmodel is None or score_test > bestscore_test:
            bestk=k
            bestscore_train=score_train
            bestscore_test=score_test
            bestmodel=reg

    return bestmodel,bestk,bestscore_train,bestscore_test
|
141
|
-
|
142
|
-
if __name__=='__main__':
    # Demo: best neighbour count for 240 samples with 20 features
    dfprice=get_stock_price('MSFT','4/3/2019','1/1/2015')
    X,y,ndprice=make_price_sample(dfprice,1,240,20)
    bestmodel,bestk,bestscore_train,bestscore_test= \
        bestKN(X,y)
    print(bestk,bestscore_train,bestscore_test)
|
147
|
-
|
148
|
-
|
149
|
-
#==============================================================================
|
150
|
-
def bestFN(dfprice,n_nextdays=1,n_samples=252,maxFN=252,random_state=0,maxk=10):
    """
    Search for the feature count whose best k-neighbour model scores
    highest on the test split.

    Strategy: maximise the test-set score; over-fitting is not considered.

    Parameters:
        dfprice: newest-first price table accepted by make_price_sample.
        n_nextdays: forwarded to make_price_sample.
        n_samples: forwarded to make_price_sample.
        maxFN: largest feature count to try (252 by default). The search
            stops early once dfprice has too few rows for the next count.
        random_state: seed forwarded to bestKN.
        maxk: largest neighbour count bestKN may try (10 by default,
            which is bestKN's own default).

    Returns:
        (bestmodel, bestf, bestk, bestscore_train, bestscore_test)
    """
    # Starting point: a single feature
    bestf=1
    X,y,ndprice=make_price_sample(dfprice,n_nextdays,n_samples,bestf)
    # Best neighbour count for that feature count
    bestmodel,bestk,bestscore_train,bestscore_test= \
        bestKN(X,y,maxk=maxk,random_state=random_state)

    # Grow the feature count one at a time
    for f in range(2,maxFN+1):
        # Not enough prices left for this many features
        if len(dfprice) < (n_nextdays+n_samples+f):
            break
        X,y,ndprice=make_price_sample(dfprice,n_nextdays,n_samples,f)
        model,k,score_train,score_test= \
            bestKN(X,y,maxk=maxk,random_state=random_state)

        # Only a strictly better test score replaces the current best
        if score_test > bestscore_test:
            bestf=f
            bestk=k
            bestscore_train=score_train
            bestscore_test=score_test
            bestmodel=model

    # Model with the best test score, its feature/neighbour counts and scores
    return bestmodel,bestf,bestk,bestscore_train,bestscore_test
|
177
|
-
|
178
|
-
if __name__=='__main__':
    # Demo: search the best feature count for MSFT
    dfprice=get_stock_price('MSFT','4/4/2020','1/1/2015')
    (bestmodel,bestf,bestk,
     bestscore_train,bestscore_test)=bestFN(dfprice,1,252)

    print("best f=",bestf,",best k=",bestk,
          "\nbest score on train=",bestscore_train,
          "\nbest score on test=",bestscore_test)
|
186
|
-
|
187
|
-
|
188
|
-
#==============================================================================
|
189
|
-
def bestKN2(X,y,maxk=10,random_state=0):
    """
    Pick the neighbour count whose training and test scores are closest.

    Strategy 2: minimise |score_train / score_test - 1|, in the hope of
    limiting both over-fitting and under-fitting.

    Parameters:
        X: feature matrix.
        y: label matrix.
        maxk: largest neighbour count to try (k runs from 1 to maxk).
        random_state: seed passed to train_test_split.

    Returns:
        (bestmodel, bestk, bestscore_train, bestscore_test, bestrate)
        where bestrate is the gap measure of the selected model.
    """
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsRegressor

    def score_gap(score_train,score_test):
        # A test score of exactly zero would raise ZeroDivisionError for
        # plain Python floats; treat it as the worst possible gap instead
        if score_test == 0:
            return np.inf
        return np.abs(score_train / score_test -1)

    # Recent scikit-learn releases refuse numpy.matrix inputs, which is
    # what make_price_sample produces; plain ndarrays keep the same shape
    if isinstance(X,np.matrix):
        X=np.asarray(X)
    if isinstance(y,np.matrix):
        y=np.asarray(y)

    # Random split into training and test sets
    X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=random_state)

    bestmodel=None
    bestk=1
    bestscore_train=None
    bestscore_test=None
    bestrate=None

    # k=1 is always tried and is the baseline; larger k must beat it strictly
    for k in [1]+list(range(2,maxk+1)):
        reg=KNeighborsRegressor(n_neighbors=k,weights='distance',n_jobs=-1)
        reg.fit(X_train,y_train)
        score_train=reg.score(X_train,y_train)
        score_test=reg.score(X_test,y_test)
        rate=score_gap(score_train,score_test)

        if bestmodel is None or rate < bestrate:
            bestk=k
            bestrate=rate
            bestscore_train=score_train
            bestscore_test=score_test
            bestmodel=reg

    return bestmodel,bestk,bestscore_train,bestscore_test,bestrate
|
224
|
-
|
225
|
-
if __name__=='__main__':
    # Demo: compare the two neighbour-selection strategies on the same samples
    dfprice=get_stock_price('MSFT','3/27/2019','1/1/2015')
    X,y,ndprice=make_price_sample(dfprice,1,252,21)

    # Strategy 1: highest test score
    bestmodel,bestk,bestscore_train,bestscore_test=bestKN(X,y)
    print("best k=",bestk,"\nbest score on train=",bestscore_train,
          ",best score on test=",bestscore_test)

    # Strategy 2: smallest gap between training and test scores
    (bestmodel,bestk,bestscore_train,
     bestscore_test,bestrate)=bestKN2(X,y)
    print("best k=",bestk,"\nbest score on train=",bestscore_train,
          ",best score on test=",bestscore_test)
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
#==============================================================================
|
240
|
-
def bestFN2(dfprice,n_nextdays=1,n_samples=252,maxFN=252,random_state=0,maxk=10):
    """
    Search for the feature count whose best k-neighbour model has the
    closest training and test scores.

    Strategy 2: minimise the gap measure returned by bestKN2, in the hope
    of limiting both over-fitting and under-fitting.

    Parameters:
        dfprice: newest-first price table accepted by make_price_sample.
        n_nextdays: forwarded to make_price_sample.
        n_samples: forwarded to make_price_sample.
        maxFN: largest feature count to try (252 by default). The search
            stops early once dfprice has too few rows for the next count.
        random_state: seed forwarded to bestKN2.
        maxk: largest neighbour count bestKN2 may try (10 by default,
            which is bestKN2's own default).

    Returns:
        (bestmodel, bestf, bestk, bestscore_train, bestscore_test, bestrate)
    """
    # Starting point: a single feature
    bestf=1
    X,y,ndprice=make_price_sample(dfprice,n_nextdays,n_samples,bestf)
    # Best neighbour count for that feature count
    bestmodel,bestk,bestscore_train,bestscore_test,bestrate= \
        bestKN2(X,y,maxk=maxk,random_state=random_state)

    # Grow the feature count one at a time
    for f in range(2,maxFN+1):
        # Not enough prices left for this many features
        if len(dfprice) < (n_nextdays+n_samples+f):
            break
        X,y,ndprice=make_price_sample(dfprice,n_nextdays,n_samples,f)
        model,k,score_train,score_test,rate= \
            bestKN2(X,y,maxk=maxk,random_state=random_state)

        # Only a strictly smaller gap replaces the current best
        if rate < bestrate:
            bestf=f
            bestk=k
            bestscore_train=score_train
            bestscore_test=score_test
            bestrate=rate
            bestmodel=model

    # Selected model, its feature/neighbour counts, scores and gap
    return bestmodel,bestf,bestk,bestscore_train,bestscore_test,bestrate
|
268
|
-
|
269
|
-
if __name__=='__main__':
    # Demo: run both feature-count searches on the same price table
    dfprice=get_stock_price('MSFT','3/27/2019','1/1/2015')
    bestmodel,bestf,bestk,bestscore_train,bestscore_test= \
        bestFN(dfprice,1,252)
    print("best f=",bestf,",best k=",bestk, \
          "\nbest score on train=",bestscore_train, \
          "\nbest score on test=",bestscore_test)

    # bestFN2 expects the price table (not the price matrix) and returns
    # six values, the last one being the train/test gap measure
    bestmodel,bestf,bestk,bestscore_train,bestscore_test,bestrate= \
        bestFN2(dfprice,1,252)
    print("best f=",bestf,",best k=",bestk, \
          "\nbest score on train=",bestscore_train, \
          "\nbest score on test=",bestscore_test)
|
282
|
-
|
283
|
-
#==============================================================================
|
284
|
-
def isdate(adate):
    """
    Check whether a value can be interpreted as a date.

    Parameters:
        adate: the date to check, e.g. a 'YYYY-MM-DD' string.

    Returns:
        True when pandas.to_datetime parses the value into an actual
        timestamp, False when parsing fails or yields no date (None or
        NaT, which is what missing or empty input turns into).
    """
    import pandas as pd

    try:
        parsed=pd.to_datetime(adate)
    except Exception:
        return False

    # to_datetime does not raise for missing input; it hands back None/NaT
    return parsed is not None and parsed is not pd.NaT
|
297
|
-
|
298
|
-
#==============================================================================
|
299
|
-
def date_adjust(basedate, adjust=0):
    """
    Shift a date by a number of calendar days.

    Parameters:
        basedate: the starting date, in any form pandas.to_datetime accepts.
        adjust: number of days to move; negative values move to earlier
            dates, positive values to later dates.

    Returns:
        The shifted date as a 'YYYY-MM-DD' string, or None (after printing
        a message) when basedate is not a usable date.
    """
    import pandas as pd
    from datetime import timedelta

    # Validate the base date; missing input parses to None/NaT without raising
    try:
        bd=pd.to_datetime(basedate)
    except Exception:
        bd=None
    if bd is None or bd is pd.NaT:
        print("*** 错误#1(date_adjust),无效的日期:",basedate)
        return None

    # Shift, then keep only the calendar date
    nd=bd+timedelta(days=adjust)
    return str(nd.date())
|
322
|
-
|
323
|
-
if __name__ =="__main__":
    # Demo: the date 365 days before 2020-3-17
    basedate,adjust='2020-3-17',-365
    newdate=date_adjust(basedate, adjust)
    print(newdate)
|
328
|
-
|
329
|
-
#==============================================================================
|
330
|
-
def forecast_stock_price(ticker,atdate,n_nextdays,n_samples=252, \
                         maxk=20,maxFN=252,random_state=0,printout=True):
    """
    Forecast the closing price n_nextdays ahead of atdate, running both
    the bestFN and the bestFN2 search and keeping the better one.

    Parameters:
        ticker: security code.
        atdate: reference date (checked with isdate).
        n_nextdays: how many trading days ahead to forecast.
        n_samples: number of samples used for learning.
        maxk: kept for interface compatibility; this function does not
            forward it to the searches.
        maxFN: largest feature count to try; forwarded to both searches.
        random_state: seed forwarded to both searches.
        printout: when True, print the forecast and the model parameters.

    Returns:
        (predicted_y, bestscore_train, bestscore_test, bestfeature, bestk),
        or None (after printing a message) when the date is invalid, the
        prices cannot be fetched, or there are too few of them.
    """
    import numpy as np

    # Validate the reference date
    if not isdate(atdate):
        print(".Error(forecast_stock_price), invalid date:",atdate)
        return None

    print("... Predicting stock price, it may take long time, please wait ......")

    # Go back far enough to cover samples, features and the forecast horizon
    n_needed=n_nextdays + n_samples + maxFN + 1
    nyears=int(n_needed/252)+2
    start=date_adjust(atdate,-366*nyears)

    # Fetch the prices
    dfprice=get_stock_price(ticker,atdate,start)
    if dfprice is None:
        print(".Error(forecast_stock_price), failed to capture stock prices:",ticker)
        return None
    if len(dfprice) < n_needed:
        print(".Error(forecast_stock_price), insufficient number of stock prices!")
        return None

    # Search 1: best feature count by test score alone. maxFN is forwarded
    # so the caller's limit is honoured
    bestmodel1,bestf1,bestk1,bestscore_train1,bestscore_test1= \
        bestFN(dfprice,n_nextdays,n_samples,maxFN=maxFN,random_state=random_state)
    X,y,ndprice=make_price_sample(dfprice,n_nextdays,n_samples,bestf1)

    # Predict from the newest bestf1 prices; predict() is given a plain
    # 2-D ndarray because recent scikit-learn refuses numpy.matrix
    X_new1=np.asarray(ndprice[0,0:bestf1])
    y_new1=bestmodel1.predict(X_new1)

    # Search 2: best feature count by train/test closeness. It now uses the
    # same maxFN and random_state as search 1 (it used to ignore both)
    bestmodel2,bestf2,bestk2,bestscore_train2,bestscore_test2,bestrate2= \
        bestFN2(dfprice,n_nextdays,n_samples,maxFN=maxFN,random_state=random_state)
    X,y,ndprice=make_price_sample(dfprice,n_nextdays,n_samples,bestf2)
    X_new2=np.asarray(ndprice[0,0:bestf2])
    y_new2=bestmodel2.predict(X_new2)

    # Final choice: the higher test score wins, ties go to search 2
    if bestscore_test1 <= bestscore_test2:
        predicted_y=y_new2[0,0]
        bestscore_train=bestscore_train2
        bestscore_test=bestscore_test2
        bestfeature=bestf2
        bestk=bestk2
    else:
        predicted_y=y_new1[0,0]
        bestscore_train=bestscore_train1
        bestscore_test=bestscore_test1
        bestfeature=bestf1
        bestk=bestk1

    if printout:
        print(" Forecasted price:%10.2f" % predicted_y)
        print(" Best score on train:",round(bestscore_train,4))
        print(" Best score on test:",round(bestscore_test,4))
        print(" Best number of features:",bestfeature)
        print(" Best number of neighbors:",bestk)

    return predicted_y,bestscore_train,bestscore_test,bestfeature,bestk
|
394
|
-
|
395
|
-
|
396
|
-
if __name__ =="__main__":
    # Demo: forecast the next trading day's MSFT close
    ticker,atdate,n_nextdays='MSFT',"2020-4-2",1
    info=forecast_stock_price(ticker,atdate,n_nextdays)
    print(info)
|
402
|
-
|
403
|
-
#==============================================================================
|
404
|
-
def forecast_stock_price2(dfprice,n_nextdays,n_samples=252, \
                          maxk=20,maxFN=252,random_state=0):
    """
    Forecast the closing price n_nextdays ahead from an already fetched
    price table, running both the bestFN and the bestFN2 search with a
    single random seed and keeping the better one.

    Parameters:
        dfprice: newest-first price table accepted by make_price_sample.
        n_nextdays: how many trading days ahead to forecast.
        n_samples: number of samples used for learning.
        maxk: kept for interface compatibility; this function does not
            forward it to the searches.
        maxFN: largest feature count to try; forwarded to both searches.
        random_state: seed forwarded to both searches.

    Returns:
        (predicted_y rounded to 2 decimals, bestscore_train and
        bestscore_test rounded to 4 decimals, bestfeature, bestk)
    """
    import numpy as np

    # Search 1: best feature count by test score alone. maxFN is forwarded
    # so the caller's limit is honoured
    bestmodel1,bestf1,bestk1,bestscore_train1,bestscore_test1= \
        bestFN(dfprice,n_nextdays,n_samples,maxFN=maxFN,random_state=random_state)
    X,y,ndprice=make_price_sample(dfprice,n_nextdays,n_samples,bestf1)

    # Predict from the newest bestf1 prices; predict() is given a plain
    # 2-D ndarray because recent scikit-learn refuses numpy.matrix
    X_new1=np.asarray(ndprice[0,0:bestf1])
    y_new1=bestmodel1.predict(X_new1)

    # Search 2: best feature count by train/test closeness. It now receives
    # the seed as well; before, only search 1 changed when callers iterated
    # over random seeds
    bestmodel2,bestf2,bestk2,bestscore_train2,bestscore_test2,bestrate2= \
        bestFN2(dfprice,n_nextdays,n_samples,maxFN=maxFN,random_state=random_state)
    X,y,ndprice=make_price_sample(dfprice,n_nextdays,n_samples,bestf2)
    X_new2=np.asarray(ndprice[0,0:bestf2])
    y_new2=bestmodel2.predict(X_new2)

    # Final choice: the higher test score wins, ties go to search 2
    if bestscore_test1 <= bestscore_test2:
        predicted_y=y_new2[0,0]
        bestscore_train=bestscore_train2
        bestscore_test=bestscore_test2
        bestfeature=bestf2
        bestk=bestk2
    else:
        predicted_y=y_new1[0,0]
        bestscore_train=bestscore_train1
        bestscore_test=bestscore_test1
        bestfeature=bestf1
        bestk=bestk1

    return round(predicted_y,2),round(bestscore_train,4), \
           round(bestscore_test,4),bestfeature,bestk
|
442
|
-
|
443
|
-
|
444
|
-
if __name__ =="__main__":
    # Demo: forecast from a price table fetched beforehand
    ticker,atdate,n_nextdays='MSFT',"2020-4-2",1
    dfprice=get_stock_price('MSFT','4/2/2020','1/1/2015')
    info=forecast_stock_price2(dfprice,n_nextdays)
    print(info)
|
451
|
-
|
452
|
-
#==============================================================================
|
453
|
-
def weighted_median(df,colname,colweight):
    """
    Compute the weighted median of one column of a table.

    Parameters:
        df: the data table.
        colname: name of the column holding the values.
        colweight: name of the column holding the weights.

    Returns:
        With three or more rows, the 50% weighted quantile computed by
        statsmodels' DescrStatsW. With exactly two rows, the weighted
        average of the two values. With one row, that value. None for an
        empty table.
    """
    n_rows=len(df)
    if n_rows == 0:
        return None

    # Positional access, so tables whose index is not 0,1,... work as well
    values=df[colname]
    weights=df[colweight]

    if n_rows == 1:
        return values.iloc[0]
    if n_rows == 2:
        weighted_sum=values.iloc[0]*weights.iloc[0]+values.iloc[1]*weights.iloc[1]
        return weighted_sum/(weights.iloc[0]+weights.iloc[1])

    # statsmodels is only needed (and only imported) for the general case
    from statsmodels.stats.weightstats import DescrStatsW
    wdf=DescrStatsW(values,weights=weights,ddof=1)
    wmedianlist=list(wdf.quantile([0.50]))
    return wmedianlist[0]
|
473
|
-
|
474
|
-
if __name__ =="__main__":
    # Demo: two-row table where the values double as weights
    import pandas as pd
    df=pd.DataFrame({ 'x':range(1,3), 'wt':range(1,3) })
    colname,colweight='x','wt'
    weighted_median(df,colname,colweight)
|
480
|
-
|
481
|
-
#==============================================================================
|
482
|
-
def second2time(seconds):
    """
    Turn a number of seconds into an 'hours:minutes:seconds' string.

    Parameters:
        seconds: the duration in seconds.

    Returns:
        A string such as '0:9:50.6'. The seconds part is rounded to
        1 decimal for durations of 10 s or more, 2 decimals from 0.1 s up
        to 10 s, and 4 decimals below 0.1 s.
    """
    hrs=int(seconds/3600)
    mins=int((seconds-hrs*3600)/60)

    # Shorter durations keep more decimals
    if seconds >= 10:
        digits=1
    elif seconds >= 0.1:
        digits=2
    else:
        digits=4

    secs=round(seconds-hrs*3600-mins*60,digits)
    return ":".join(str(part) for part in (hrs,mins,secs))
|
503
|
-
|
504
|
-
if __name__ =="__main__":
    # Demo: one duration per rounding regime
    for demo_seconds in (590.58963,65.456321,35.75698,5.75698,0.75698,0.00098):
        second2time(demo_seconds)
|
511
|
-
#==============================================================================
|
512
|
-
def save_to_excel(df,excelfile="myfile01.xlsx",sheetname="Sheet1"):
    """
    Save a DataFrame into an Excel workbook, appending when possible.

    - If the workbook cannot be opened (typically because it does not
      exist yet), it is created with df in sheet `sheetname`.
    - If the workbook exists but has no such sheet, the sheet is added.
    - If both exist, the rows of df are appended after the rows already
      in that sheet.

    Parameters:
        df: the data to save.
        excelfile: workbook file name (default "myfile01.xlsx").
        sheetname: target sheet name (default "Sheet1").

    Returns:
        Nothing. Progress and failures are reported with print().

    Note: numbers stored as text in df are written to Excel as numbers,
    so data read back may differ in type from df.
    """
    import pandas as pd

    # Column names of df
    dflist=list(df)
    filename=excelfile

    try:
        file1=pd.ExcelFile(excelfile)
    except Exception:
        # Workbook cannot be opened: write a fresh one. The former
        # encoding= argument is dropped; pandas 2.x no longer accepts it
        df.to_excel(filename,sheet_name=sheetname,header=True)
        print("*** Results saved in",filename,"@ sheet",sheetname)
        return

    # Workbook exists: load every sheet into a {sheet name: DataFrame} mapping
    sheets=pd.read_excel(file1,sheet_name=None)
    file1.close()

    sheetlist=list(sheets.keys())

    # Target sheet already there: append df to its current rows.
    # Text-typed numbers in df may need an explicit cast before merging
    dup=sheetname in sheetlist
    if dup:
        existing=sheets[sheetname][dflist]
        sheets[sheetname]=pd.concat([existing,df],axis=0,ignore_index=True)

    # NOTE(review): as before, every existing sheet is written back with
    # only df's columns, so other sheets must contain those columns and
    # lose any others; confirm whether that is intended
    frames=[(s,sheets[s][dflist]) for s in sheetlist]
    if not dup:
        frames.append((sheetname,df))

    # The context manager saves and closes the workbook; ExcelWriter.save()
    # no longer exists in pandas 2.x
    try:
        with pd.ExcelWriter(filename) as writer:
            for name,frame in frames:
                frame.to_excel(writer,sheet_name=name,header=True,index=True)
    except Exception:
        print("... Error(save_to_excel): writing file failed",filename,"@ sheet",sheetname)
        print("Information:",filename)
        return
    print("*** Results saved in",filename,"@ sheet",sheetname)

    return
|
584
|
-
|
585
|
-
|
586
|
-
#==============================================================================
|
587
|
-
def forecast_stock_price_rs(ticker, atdate, n_nextdays=1, n_samples=252,
                            maxk=20, maxFN=252, random_state=0, maxRS=9,
                            excelfile="myfile01.xlsx", sheetname="Sheet1"):
    """
    Predict a future closing price, trying a range of random seeds in turn.

    Parameters:
    1. ticker: stock code
    2. atdate: observation date, today or an earlier day
    3. n_nextdays: number of trading days after atdate whose price is predicted
    4. n_samples: maximum number of observations in the learning sample
    5. maxk: maximum number of neighbors to try
    6. maxFN: maximum number of features to try
    7. random_state: first random seed to try
    8. maxRS: last random seed to try (inclusive)
    9. excelfile: name of the Excel file the result table is saved to
    10. sheetname: sheet name inside the Excel file

    Output: prints the model parameters and predicted price each time a seed
    achieves a better test score than all seeds tried before it.
    Returns: a DataFrame with one row per seed tried, or None when the date
    is invalid or not enough prices could be retrieved.
    """
    import time
    import pandas as pd

    # Validate the observation date
    if not isdate(atdate):
        print(".Error(forecast_stock_price_rs), invalid date:", atdate)
        return None

    # Start timing
    print("\n... Predicting stock price, it may take very long time, please wait ......")
    time0 = time.perf_counter()

    # Work out how far back the price history has to start
    required = n_nextdays + n_samples + maxFN + 1
    nyears = int(required / 252) + 2
    start = date_adjust(atdate, -366 * nyears)

    # Retrieve the price history
    dfprice = get_stock_price(ticker, atdate, start)
    if dfprice is None:
        print(".Error(forecast_stock_price_rs), failed to capture stock prices:", ticker)
        return None
    if len(dfprice) < required:
        print(".Error(forecast_stock_price_rs), insufficient number of stock prices!")
        return None

    columns = ['ticker', 'atdate', 'n_nextdays', 'n_samples',
               'random_state', 'pred_y', 'bestscore_train',
               'bestscore_test', 'bestfeature', 'bestk']

    # Best record so far; pred_y stays None if no seed scores above zero
    bestscore_test = 0.0
    pred_y = None

    # Try the seeds from the largest down to the smallest
    rslist = list(range(random_state, maxRS + 1))
    rslist.reverse()

    # Rows are collected in a list: DataFrame.append was removed in pandas 2.0
    rows = []
    for rs in rslist:
        print("... Testing random seed:", rs)
        pred_y0, bestscore_train0, bestscore_test0, bestfeature0, bestk0 = \
            forecast_stock_price2(dfprice, n_nextdays=n_nextdays,
                                  n_samples=n_samples, maxk=maxk,
                                  maxFN=maxFN, random_state=rs)

        # Record the outcome for this seed
        rows.append({'ticker': ticker, 'atdate': atdate, 'n_nextdays': n_nextdays,
                     'n_samples': n_samples, 'random_state': rs, 'pred_y': pred_y0,
                     'bestscore_train': bestscore_train0, 'bestscore_test': bestscore_test0,
                     'bestfeature': bestfeature0, 'bestk': bestk0})

        # Update and report the best record
        if bestscore_test < bestscore_test0:
            pred_y = pred_y0
            bestscore_train = bestscore_train0
            bestscore_test = bestscore_test0
            bestfeature = bestfeature0
            bestk = bestk0

            print(" Predicted stock price :", pred_y)
            print(" Best score on train :", bestscore_train)
            print(" Best score on test :", bestscore_test)
            print(" Best number of features :", bestfeature)
            print(" Best number of neighbors:", bestk, "\n")

    result = pd.DataFrame(rows, columns=columns)

    # Show all per-seed results again
    pd.set_option('display.unicode.ambiguous_as_wide', True)
    pd.set_option('display.unicode.east_asian_width', True)
    pd.set_option('display.width', 180)  # print width (important)
    print("... Summary:")
    print(result.to_string(index=False))
    # Report the tracked best prediction; the last row of the table is only
    # the last seed tried, not the best-scoring one
    print("\n... Result by highest score on test:", pred_y)

    # Report the running time
    time1 = time.perf_counter()
    elapsed = time1 - time0
    print("... Total elapsed time is", second2time(elapsed))

    save_to_excel(result, excelfile, sheetname)
    print("... Results saved in an Excel file:", excelfile, "@sheet", sheetname)

    return result
|
688
|
-
|
689
|
-
if __name__ == "__main__":
    # Demo: forecast MSFT one trading day ahead using seeds 0..1
    ticker = 'MSFT'
    atdate = "2020-4-5"
    n_nextdays = 1
    maxRS = 1
    info = forecast_stock_price_rs(ticker, atdate, n_nextdays, maxRS=maxRS)
    # forecast_stock_price_rs returns None when the date is invalid or the
    # price history could not be retrieved
    if info is not None:
        print(info.to_string(index=False))
|
696
|
-
|
697
|
-
#==============================================================================
|
698
|
-
def multisummary(result, notes='', top=5):
    """
    Summarise per-seed predictions and compute their weighted median.

    Parameters:
    1. result: DataFrame of per-seed best predictions; the columns pred_y,
       bestscore_test and bestfeature are read here
    2. notes: label printed with the summary; the value 'final' additionally
       prints the two predicted prices
    3. top: number of highest-test-score rows passed to the weighted median

    Output: prints the sorted table.
    Returns: (hsotest, pred_y_wmedian), i.e. the prediction of the row with
    the highest test score and the value computed by weighted_median over
    the top rows using bestscore_test as weights, both rounded to 2 decimals;
    None when result is empty.
    """
    # Nothing to summarise
    if len(result) == 0:
        print("... Error(multisummary), No data recieved!")
        return None

    # Sort ascending on a copy: the caller's frame (possibly a slice of a
    # larger frame) is left untouched
    ranked = result.sort_values(by=["bestscore_test", "bestfeature"],
                                ascending=[True, True])

    # Weighted median over the best rows, which sit at the tail after sorting
    if len(ranked) < top:
        top = len(ranked)
    topdata = ranked.tail(top)
    pred_y_wmedian = round(weighted_median(topdata, 'pred_y', 'bestscore_test'), 2)

    # Show the detailed table
    import pandas as pd
    pd.set_option('display.unicode.ambiguous_as_wide', True)
    pd.set_option('display.unicode.east_asian_width', True)
    pd.set_option('display.width', 180)  # print width (important)

    print("\n... Summary:", notes)
    print(ranked.to_string(index=False))
    hsotest = round(ranked.tail(1)['pred_y'].values[0], 2)
    if notes == 'final':
        print("\n... Predicted price by highest score on test:", hsotest)
        print("... Predicted in median weighted by score on test:", pred_y_wmedian)

    return hsotest, pred_y_wmedian
|
735
|
-
|
736
|
-
if __name__ == "__main__":
    # Summarise the table returned by the forecast_stock_price_rs demo,
    # which is stored in `info`; skip when that demo returned None.
    # multisummary returns a (highest-score price, weighted median) pair.
    if info is not None:
        wmprice = multisummary(info, top=5)
|
738
|
-
|
739
|
-
#==============================================================================
|
740
|
-
def forecast_stock_price3(dfprice, n_nextdays=1, n_samples=252*3,
                          maxk=20, maxFN=252*3, random_state=0):
    """
    Predict a future closing price for one random seed; usable as a
    stand-alone worker process.

    Parameters:
    1. dfprice: price data set retrieved beforehand
    2. n_nextdays: number of trading days ahead whose price is predicted
    3. n_samples: maximum number of observations in the learning sample
    4. maxk: maximum number of neighbors to try
    5. maxFN: maximum number of features to try
    6. random_state: random seed

    Returns: a pandas Series holding the seed and the five values returned
    by forecast_stock_price2 (pred_y, bestscore_train, bestscore_test,
    bestfeature, bestk).
    """
    import multiprocessing as mp
    import pandas as pd

    # Announce which process handles this seed
    worker = mp.current_process().name
    print("... Starting sub-process", worker, "with random_state", random_state)

    # Search for the best score under this single seed
    price, train_score, test_score, n_features, n_neighbors = forecast_stock_price2(
        dfprice,
        n_nextdays=n_nextdays,
        n_samples=n_samples,
        maxk=maxk,
        maxFN=maxFN,
        random_state=random_state,
    )

    # Pack the outcome into one record
    record = {
        'random_state': random_state,
        'pred_y': price,
        'bestscore_train': train_score,
        'bestscore_test': test_score,
        'bestfeature': n_features,
        'bestk': n_neighbors,
    }
    outcome = pd.Series(record)

    print("... Endting sub-process", worker)
    return outcome
|
774
|
-
|
775
|
-
if __name__ == "__main__":
    # Demo: run the single-seed worker directly.
    # NOTE(review): dfprice is not assigned in this block; presumably it is
    # left at module level by an earlier demo -- confirm before running.
    ticker, atdate = 'MSFT', "2020-4-5"
    n_nextdays, random_state = 1, 0
    info = forecast_stock_price3(dfprice, n_nextdays=n_nextdays,
                                 random_state=random_state)
    print(info)
|
782
|
-
|
783
|
-
#==============================================================================
|
784
|
-
def forecast_stock_price_mp(ticker, atdate, n_nextdays=1, n_samples=252*3,
                            maxk=20, maxFN=252*3, random_state=0, maxRS=9, top=5):
    """
    Predict a future closing price over a range of random seeds, running the
    seeds in parallel worker processes.

    Parameters:
    1. ticker: stock code
    2. atdate: observation date, today or an earlier day
    3. n_nextdays: number of trading days after atdate whose price is predicted
    4. n_samples: maximum number of observations in the learning sample
    5. maxk: maximum number of neighbors to try
    6. maxFN: maximum number of features to try
    7. random_state: first random seed to try
    8. maxRS: last random seed to try (inclusive)
    9. top: number of best rows used for the weighted median

    Output: prints a summary per pool and a final summary.
    Returns: (hsotest, wmprice) as returned by multisummary for the complete
    result table, or None when the date is invalid, prices are unavailable
    or insufficient, or the seed range is empty.
    """
    import os
    import time
    import multiprocessing as mp
    import pandas as pd

    # Validate the observation date
    if not isdate(atdate):
        print(".Error(forecast_stock_price_mp), invalid date:", atdate)
        return None

    # Opening information
    print("\n... Predicting stock price by knn model ......")
    print(" Stock:", ticker)
    print(" Observation date:", atdate)
    print(" Number of trading day(s) being predicted:", n_nextdays)
    print(" Max number of historical prices used:", n_samples)
    print(" Max number of features used in knn:", maxFN)
    print(" Max number of neighbors used in knn:", maxk)
    print(" Max number of obs used in weighted meadian:", top)
    print(" WARNING: It may take long time, please wait ......")
    # Start timing
    time0 = time.perf_counter()

    print("\n... Capturing historical stock prices ......", end='')
    # Work out how far back the price history has to start
    required = n_nextdays + n_samples + maxFN + 1
    nyears = int(required / 252) + 1
    start = date_adjust(atdate, -366 * nyears)
    # Retrieve the price history
    dfprice = get_stock_price(ticker, atdate, start)
    if dfprice is None:
        print("\n Error(forecast_stock_price_mp), failed to capture stock prices:", ticker)
        return None
    if len(dfprice) < required:
        print("\n Error(forecast_stock_price_mp), insufficient number of stock prices!")
        return None
    print(", done!")
    print(" ", len(dfprice), "historical stock prices captured")

    print("... Start machine-learning using knn model in multiprocessing ......")
    # Seeds from the largest down to the smallest
    seeds = list(range(random_state, maxRS + 1))
    seeds.reverse()
    jobnum = len(seeds)
    if jobnum == 0:
        # multisummary would return None for an empty table and the tuple
        # unpacking below would fail, so stop here
        print("\n Error(forecast_stock_price_mp), empty random seed range:",
              random_state, "to", maxRS)
        return None

    # Number of CPU cores; os.cpu_count() may return None
    cores = os.cpu_count() or 1
    print(" There are", cores, "core(s) inside the cpu of this computer")
    # Pool size
    procnum = cores + 1 if cores <= 4 else cores

    columns = ['random_state', 'pred_y', 'bestscore_train',
               'bestscore_test', 'bestfeature', 'bestk']
    # Rows are collected in a list: DataFrame.append was removed in pandas 2.0
    all_rows = []

    # Process the seeds in groups of at most procnum
    for g, grpstart in enumerate(range(0, jobnum, procnum)):
        # The actual seed values for this group (not their positions)
        group_seeds = seeds[grpstart:grpstart + procnum]

        timep0 = time.perf_counter()
        with mp.Pool(processes=procnum) as pool:
            print("\n... Pool", g, "created with max capacity of", procnum, "processes in parallel")
            mptasks = [pool.apply_async(forecast_stock_price3,
                                        args=(dfprice, n_nextdays, n_samples,
                                              maxk, maxFN, seed))
                       for seed in group_seeds]
            # get() blocks until the worker finishes and re-raises its errors
            group_rows = [task.get() for task in mptasks]

        # Record and summarise this group's results
        all_rows.extend(group_rows)
        print(" Completed processes for random_state", group_seeds)
        multisummary(pd.DataFrame(group_rows, columns=columns),
                     notes="Pool " + str(g), top=top)
        # Running time of this group
        elapsedp = time.perf_counter() - timep0
        print(" Elapsed time in Pool", g, "is", second2time(elapsedp))

    # Sort the complete table
    result = pd.DataFrame(all_rows, columns=columns)
    result = result.sort_values(by=['bestscore_test'], ascending=True)

    # Final summary
    hsotest, wmprice = multisummary(result, 'final', top)

    # Overall running time
    elapsed = time.perf_counter() - time0
    print("\n... Total elapsed time is", second2time(elapsed))

    return hsotest, wmprice
|
897
|
-
|
898
|
-
if __name__ == "__main__":
    # Demo: multi-process forecast for MSFT using seeds 0..2
    ticker, atdate = 'MSFT', "2020-4-5"
    n_nextdays = 1
    minRS, maxRS = 0, 2
    predicted_prices = forecast_stock_price_mp(
        ticker, atdate, n_nextdays=n_nextdays,
        random_state=minRS, maxRS=maxRS)
|
906
|
-
|
907
|
-
#==============================================================================
|
908
|
-
|
909
|
-
#==============================================================================
|
910
|
-
|