getfactormodels 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of getfactormodels might be problematic. Click here for more details.

@@ -0,0 +1,43 @@
1
+ # -*- coding: utf-8 -*-
2
+ # MIT License
3
+ #
4
+ # Copyright (c) 2023 S. Martin
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in all
14
+ # copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ # SOFTWARE.
23
+ __version__ = "0.0.1"
24
+
25
+ from .__main__ import FactorExtractor, get_factors
26
+ from .models import models # noqa: F401
27
+ from .models.models import (barillas_shanken_factors, carhart_factors,
28
+ dhs_factors, ff_factors, hml_devil_factors,
29
+ icr_factors, liquidity_factors, mispricing_factors,
30
+ q_classic_factors, q_factors)
31
+
32
+ __all__ = ["FactorExtractor",
33
+ "ff_factors",
34
+ "icr_factors",
35
+ "q_factors",
36
+ "q_classic_factors",
37
+ "mispricing_factors",
38
+ "dhs_factors",
39
+ "liquidity_factors",
40
+ "hml_devil_factors",
41
+ "barillas_shanken_factors",
42
+ "carhart_factors",
43
+ "get_factors", ]
@@ -0,0 +1,168 @@
1
+ # -*- coding: utf-8 -*-
2
+ # ruff: noqa: F401
3
+ # TODO: rename __main__.py
4
+ import os
5
+ import pandas as pd
6
+ from dateutil import parser
7
+ from getfactormodels.models.models import barillas_shanken_factors # noqa
8
+ from getfactormodels.models.models import (carhart_factors, dhs_factors,
9
+ ff_factors, hml_devil_factors,
10
+ icr_factors, liquidity_factors,
11
+ mispricing_factors,
12
+ q_classic_factors, q_factors)
13
+ from getfactormodels.utils.cli import parse_args
14
+ from getfactormodels.utils.utils import _get_model_key, _process
15
+
16
+
17
def get_factors(model: str = "3",  # noqa: C901
                frequency: str = "M",
                start_date=None,
                end_date=None,
                output=None) -> pd.DataFrame:
    """Get data for a specified factor model.

    Return a DataFrame containing the data for the specified model and
    frequency. If an output is specified, factor data is saved to a file.

    Notes:
    - Any string matching a model's regex (e.g., `liq` for `liquidity`) can be
      used as a model name.
    - Dates should be in ``YYYY-MM-DD`` format, but anything that
      ``dateutil.parser.parse()`` can interpret will work.
    - Weekly data is only available for the q-factor and Fama-French 3-factor
      models.

    Parameters:
        model (str): the factor model to return. One of: `liquidity`,
            `icr`, `dhs`, `q`, `q_classic`, `ff3`, `ff5`, `ff6`, `carhart4`,
            `hml_devil`, `barillas_shanken`, or `mispricing`.
        frequency (str): the frequency of the data, case-insensitive
            (default: M). Which of D, W, M, Q or Y are available depends on
            the model.
        start_date (str, optional): the start date of the data, YYYY-MM-DD.
        end_date (str, optional): the end date of the data, YYYY-MM-DD.
        output (str, optional): a filename, directory, or filepath. Accepts
            '.txt', '.csv', '.md', '.xlsx', '.pkl' as file extensions.

    Returns:
        pandas.DataFrame: factor data, indexed by date.

    Raises:
        ValueError: if the model name is not recognised, or no
            ``<model>_factors`` function exists for the resolved key.
    """
    frequency = frequency.lower()
    model = _get_model_key(model)

    # The Fama-French/Carhart keys are bare digits and share one entry point.
    # `output` is forwarded so that a requested file is written for these
    # models as well (it used to be dropped on this path).
    if model in ("3", "4", "5", "6"):
        return ff_factors(model, frequency, start_date, end_date, output)

    # Every other key maps onto a module-level `<key>_factors` function.
    function = globals().get(f"{model}_factors")
    if not function:
        raise ValueError(f"Invalid model: {model}")

    return function(frequency, start_date, end_date, output)
63
+
64
+
65
class FactorExtractor:
    """
    Extracts factor data based on specified parameters.

    Args:
        model (str): The factor model to use. Defaults to '3'.
        frequency (str, optional): The frequency of the data. Defaults to 'M'.
        start_date (str, optional): The start date of the data. Normalised
            to ``YYYY-MM-DD`` when given.
        end_date (str, optional): The end date of the data. Normalised to
            ``YYYY-MM-DD`` when given.
        output (str, optional): Stored on the instance as ``output``; no
            method of this class reads it (use ``to_file`` to save).

    Methods:
        no_rf: Flags that the 'RF' column should be dropped on fetch.
        validate_date_format: Normalises a date string to YYYY-MM-DD.
        get_factors: Fetches the factor data and stores it on the instance.
        drop_rf: Drops the 'RF' column from a DataFrame.
        to_file: Saves the fetched factor data to a file.
    """

    def __init__(self,
                 model='3',
                 frequency='M',
                 start_date=None,
                 end_date=None,
                 output=None):
        self.model: str = model
        self.frequency: str = frequency
        # Falsy dates (None, '') are stored as None rather than parsed.
        self.start_date = (self.validate_date_format(start_date)
                           if start_date else None)
        self.end_date = (self.validate_date_format(end_date)
                         if end_date else None)
        self.output = output
        self._no_rf = False
        self.df = None

    def no_rf(self):
        """Sets the _no_rf flag to True."""
        self._no_rf = True

    @staticmethod
    def validate_date_format(date_string):
        """
        Validate a date string and return it formatted as YYYY-MM-DD.

        Raises:
            ValueError: If the date cannot be interpreted.
        """
        try:
            return parser.parse(date_string).strftime("%Y-%m-%d")
        # dateutil raises OverflowError for out-of-range numeric components;
        # report that through the same ValueError as any other bad date.
        except (ValueError, OverflowError) as err:
            raise ValueError("Incorrect date format, use YYYY-MM-DD.") from err

    def get_factors(self) -> pd.DataFrame:
        """Fetch the factor data, store it in ``self.df`` and return it."""
        self.df = get_factors(
            model=self.model,
            frequency=self.frequency,
            start_date=self.start_date,
            end_date=self.end_date)

        if self._no_rf:
            self.df = self.drop_rf(self.df)

        return self.df

    def drop_rf(self, df):
        """Return ``df`` without its ``RF`` column.

        If there is no ``RF`` column a notice is printed and ``df`` is
        returned unchanged.
        """
        if "RF" in df.columns:
            df = df.drop(columns=["RF"])
        else:
            print("`drop_rf` was called but no RF column was found.")

        return df

    def to_file(self, filename):
        """
        Save the factor data to a file.

        Args:
            filename (str): The name of the file to save the data to.

        Raises:
            ValueError: If no data has been fetched yet.
        """
        if self.df is None:
            raise ValueError("No data to save. Fetch factors first.")

        # TODO: could call _save_to_file directly
        _process(self.df, filepath=filename)
148
+
149
+
150
def main():
    """Command-line entry point.

    Parses the CLI arguments, fetches the requested factor data through a
    ``FactorExtractor`` and either saves it to the requested output path or
    prints the DataFrame.
    """
    cli = parse_args()

    extractor = FactorExtractor(model=cli.model,
                                frequency=cli.freq,
                                start_date=cli.start,
                                end_date=cli.end)
    if cli.no_rf:
        extractor.no_rf()

    factors = extractor.get_factors()

    # No output path requested: just show the data.
    if not cli.output:
        print(factors)
        return

    extractor.to_file(cli.output)
    print(f'File saved to "{os.path.abspath(cli.output)}"')


if __name__ == '__main__':
    main()
@@ -0,0 +1,24 @@
1
+ # -*- coding: utf-8 -*-
2
+ # MIT License
3
+ #
4
+ # Copyright (c) 2023 S. Martin
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ # SOFTWARE.
23
+ from . import ff_models # noqa: F401 - TODO: disable 401 in all __init__
24
+ from . import models # noqa: F401
@@ -0,0 +1,141 @@
1
+ # -*- coding: utf-8 -*-
2
+ import numpy as np
3
+ import pandas as pd # noqa: D100
4
+ from ..utils.utils import ( # noqa - todo: fix relative import from parent modules banned
5
+ _process, get_zip_from_url)
6
+
7
+
8
+ def _ff_construct_url(model="3", frequency="M"):
9
+ """Construct and return the URL for the specified model and frequency."""
10
+ frequency = frequency.upper()
11
+
12
+ if frequency == "W" and model not in ["3", "4"]: # why 4?
13
+ raise ValueError("Weekly data is only available for the Fama \
14
+ French 3 factor model at the moment.")
15
+
16
+ base_url = "https://mba.tuck.dartmouth.edu"
17
+ ftp = "pages/faculty/ken.french/ftp"
18
+
19
+ file = f'F-F_{"Research_Data_" if model in ["3", "4", "5", "6"] else ""}'
20
+ file += ("Factors" if model in ["3", "4"]
21
+ else "5_Factors_2x3" if model in ["5", "6"]
22
+ else "")
23
+ file += "_daily" if frequency == "D" \
24
+ else "_weekly" if frequency == "W" else ""
25
+ file += "_CSV.zip"
26
+
27
+ return f"{base_url}/{ftp}/{file}"
28
+
29
+
30
def ff_read_csv_from_zip(zip_file, model=None):
    """Read the first member of a Fama-French zip archive into a DataFrame.

    The number of preamble lines skipped depends on the member's file name:
    12 when it contains 'momentum', 3 when it contains 'ly', otherwise 2.
    The last line of the file is dropped as a footer. The index is
    converted to stripped strings and named ``date``; rows with missing
    values are removed.

    Parameters:
        zip_file: an open ``zipfile.ZipFile``-like object.
        model: unused.

    Returns:
        pandas.DataFrame, or None if anything goes wrong while reading (the
        error is printed).
    """
    try:
        member = zip_file.namelist()[0]
        lowered = member.lower()

        if 'momentum' in lowered:
            preamble_rows = 12
        elif 'ly' in lowered:
            preamble_rows = 3
        else:
            preamble_rows = 2

        with zip_file.open(member) as handle:
            frame = pd.read_csv(handle,
                                skiprows=preamble_rows,
                                index_col=0,
                                header=0,
                                parse_dates=False,
                                skipfooter=1,
                                engine="python")

        # Date keys are kept as text; they are parsed later, per frequency.
        frame.index = frame.index.astype(str).str.strip()
        frame.index.name = "date"
        frame = frame.dropna()
    except Exception as e:
        print(f"Error reading file: {e}")
        return None
    return frame
52
+
53
+
54
def ff_process_data(data, model, frequency) -> pd.DataFrame:
    """Process and return the data based on the provided model and frequency.

    The raw frame can mix rows of several frequencies, told apart by the
    length of their string date key. Only the rows for the requested
    frequency are kept and their keys are converted to timestamps:

    - ``m``: 6-character ``YYYYMM`` keys, moved to the month end.
    - ``y`` or ``a``: 4-character ``YYYY`` keys, moved to 31 December.
    - anything else: 8-character ``YYYYMMDD`` keys.

    Parameters:
        data (pandas.DataFrame): frame indexed by string date keys.
        model: unused.
        frequency (str): case-insensitive frequency code.

    Returns:
        pandas.DataFrame: the matching rows with a datetime index named
        ``date``.
    """
    frequency = frequency.lower()

    # 'a' is accepted alongside 'y': the package documentation and CLI
    # advertise 'A' for annual data, which used to fall into the daily
    # branch and produce an empty frame.
    if frequency == 'm':
        key_length, date_format = 6, '%Y%m'
        offset = pd.offsets.MonthEnd(0)
    elif frequency in ('y', 'a'):
        key_length, date_format = 4, '%Y'
        offset = pd.offsets.YearEnd(0, month=12)
    else:
        key_length, date_format = 8, '%Y%m%d'
        offset = None

    data = data[data.index.str.len() == key_length]

    index = pd.to_datetime(data.index, format=date_format)
    if offset is not None:
        index = index + offset

    data.index = index
    data.index.name = "date"

    return data
85
+
86
+
87
def _ff_get_mom(frequency) -> pd.DataFrame:
    """Fetch and return the momentum factor as a one-column DataFrame.

    * Note: only for returning the raw data for the 4 and 6 factor models.

    Parameters:
        frequency (str): case-insensitive. ``D`` fetches the daily momentum
            archive; every other value fetches the non-daily archive.

    Returns:
        pandas.DataFrame: a single ``MOM`` column indexed by stripped string
        date keys, with the index named ``date``.

    Raises:
        ValueError: if the momentum file could not be read.
    """
    frequency = frequency.upper()
    base_url = "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp"
    file = ("F-F_Momentum_Factor_daily_CSV.zip" if frequency == "D"
            else "F-F_Momentum_Factor_CSV.zip")
    url = f"{base_url}/{file}"

    csv = ff_read_csv_from_zip(get_zip_from_url(url))
    # The reader reports failures by returning None; fail here with a clear
    # message instead of an AttributeError on the next line.
    if csv is None:
        raise ValueError(
            f"Could not read the momentum factor data from {url}")

    csv.columns = ["MOM"]
    csv.index.name = "date"
    csv.index = csv.index.astype(str).str.strip()

    return csv
106
+
107
+
108
def _to_numeric_or_keep(column):
    """Return ``column`` converted to a numeric dtype, or as-is on failure."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def _get_ff_factors(model: str = "3",
                    frequency: str = "M",
                    start_date=None,
                    end_date=None) -> pd.DataFrame:
    """Return the Fama French 3, 5, or 6, or Carhart 4 factor model data.

    Downloads the archive for the model and frequency, adds the momentum
    factor for models 4 (as ``MOM``) and 6 (as ``UMD``), parses the dates,
    restricts the rows to the requested date range, drops rows with missing
    values and multiplies the values by 0.01.

    Parameters:
        model (str): "3", "4", "5" or "6".
        frequency (str): frequency code; ``None`` is treated as "M".
        start_date (optional): first date to keep.
        end_date (optional): last date to keep.

    Returns:
        pandas.DataFrame: the result of passing the scaled data, start date
        and end date to ``_process``.
    """
    if frequency is None:
        frequency = "M"

    url = _ff_construct_url(model, frequency)
    archive = get_zip_from_url(url)
    csv = ff_read_csv_from_zip(archive, model)

    if model in ["4", "6"]:
        mom = _ff_get_mom(frequency)
        if model == "6":
            mom = mom.rename(columns={"MOM": "UMD"})
        csv = csv.join(mom, how="left")

    data = ff_process_data(csv, model, frequency)
    # pd.to_numeric(errors='ignore') is deprecated; convert column by column
    # and keep any column that cannot be converted.
    data = data.apply(_to_numeric_or_keep)

    if start_date is not None or end_date is not None:
        data = data.loc[start_date:end_date]

    data = data.dropna()

    data = np.multiply(data, 0.01)
    return _process(data, start_date, end_date)
139
+
140
+
141
+ # TODO: just redo all of this.
@@ -0,0 +1,462 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ models
4
+ =======
5
+ Functions for retrieving and processing multi-factor model data.
6
+
7
+ Functions for fetching data for a variety of factor models. The data can be
8
+ returned for different frequencies and for a specified date range. The function
9
+ supports a variety of model names that match specific regex patterns, including
10
+ 'liquidity', 'icr', 'dhs', 'q', 'q_classic', 'ff3', 'ff5', 'ff6', 'carhart4',
11
+ 'hml_devil', 'barillas_shanken', and 'mispricing'.
12
+
13
+ Functions
14
+ ---------
15
+ - `ff_factors`: Retrieves data for a specified Fama-French or Carhart factor
16
+ model.
17
+ - `q_factors`: Retrieves the q-factor model data from global-q.org.
18
+ - `q_classic_factors`: Retrieves the original 4-factor "q" model of Hou, Xue,
19
+ and Zhang (2015).
20
+ - `dhs_factors`: Retrieves the Daniel-Hirshleifer-Sun Behavioural factors.
21
+ - `icr_factors`: Retrieves the He, Kelly, Manela (2017) ICR factors.
22
+ - `hml_devil_factors`: Retrieves the HML Devil factors from AQR.
23
+ - `barillas_shanken_factors`: Constructs the 6-factor model of Barillas and
24
+ Shanken.
25
+ - `carhart_factors`: Retrieves the Carhart 4-factor model data.
26
+ - `liquidity_factors`: Retrieves the Pastor-Stambaugh liquidity factors.
27
+ - `mispricing_factors`: Retrieves the Stambaugh-Yuan (2017) mispricing factors.
28
+ """
29
+ import os
30
+ import pickle
31
+ from io import BytesIO
32
+ from typing import Optional, Union
33
+ import numpy as np
34
+ import pandas as pd
35
+ import requests
36
+ from getfactormodels.utils.utils import _process, get_file_from_url
37
+ from .ff_models import _get_ff_factors
38
+
39
+
40
def ff_factors(model: str = "3",  # TODO: fix: _get_ff_factors filepath param
               frequency: str = "M",
               start_date: str = None,
               end_date: str = None,
               output: str = None) -> pd.DataFrame:
    """Fetch a Fama-French or Carhart factor model.

    Returns the data for the 3-factor (1993), 5-factor (2015) or 6-factor
    (2018) model of Fama & French, or for Carhart's (1997) 4-factor model.
    When ``output`` is given it is handed to ``_process`` as the file path
    to save to.

    Notes:
    - Only the 3-factor model offers weekly data.
    - Dates should be in ``YYYY-MM-DD`` format, but anything that
      ``dateutil.parser.parse()`` can interpret will work.

    Parameters:
        model (str, int): which model to return: 3, 4, 5 or 6 (default: 3).
            Converted to ``str`` before use.
        frequency (str): D, W, M or Y (default: M).
        start_date (str, optional): first date of the data, as YYYY-MM-DD.
        end_date (str, optional): last date of the data, as YYYY-MM-DD.
        output (str, optional): a filename, directory, or filepath. If no
            extension is provided, will output a '.csv'. Accepts '.txt',
            '.csv', '.md', '.xlsx', '.pkl'.

    Returns:
        pandas.DataFrame: factor data, indexed by date.
    """
    factors = _get_ff_factors(str(model), frequency, start_date, end_date)
    return _process(factors, start_date, end_date, filepath=output)
74
+
75
+
76
def liquidity_factors(frequency: str = "M",
                      start_date: str = None,
                      end_date: str = None,
                      output: str = None) -> pd.DataFrame:
    """Retrieve the Pastor-Stambaugh liquidity factors. Monthly data only.

    Parameters:
        frequency (str): must be 'm' (case-insensitive).
        start_date (str, optional): passed to ``_process``.
        end_date (str, optional): passed to ``_process``.
        output (str, optional): passed to ``_process`` as ``filepath``.

    Returns:
        pandas.DataFrame: columns renamed to AGG_LIQ, INNOV_LIQ and
        TRADED_LIQ, indexed by month-end date.

    Raises:
        ValueError: if ``frequency`` is not monthly.
    """
    if frequency.lower() != 'm':
        print('Liquidity factors are only available for monthly frequency.')
        raise ValueError("Frequency must be 'm'.")

    url = ('https://research.chicagobooth.edu/'
           '-/media/research/famamiller/data/liq_data_1962_2022.txt')
    raw = get_file_from_url(url)

    # The column names live on the last '%'-commented line, tab separated.
    commented = [line for line in raw.readlines() if line.startswith('%')]
    headers = commented[-1][1:].strip().split('\t')

    # Rewind: reading the header lines consumed the stream.
    raw.seek(0)

    data = pd.read_csv(raw, sep='\\s+', names=headers,
                       comment='%', index_col=0)

    data.index.name = 'date'
    data.index = data.index.astype(str)

    column_names = {'Agg Liq.': 'AGG_LIQ',
                    'Innov Liq (eq8)': 'INNOV_LIQ',
                    'Traded Liq (LIQ_V)': 'TRADED_LIQ'}
    data = data.rename(columns=column_names)

    # The first 65 values in the traded liquidity series are -99.000000.
    data['TRADED_LIQ'] = data['TRADED_LIQ'].replace(-99.000000, 0)

    # Frequency is known to be monthly here: YYYYMM keys -> month-end dates.
    month_starts = pd.to_datetime(data.index, format='%Y%m')
    data.index = month_starts + pd.offsets.MonthEnd(0)

    return _process(data, start_date, end_date, filepath=output)
117
+
118
+
119
def mispricing_factors(frequency: str = "M",
                       start_date: str = None,
                       end_date: str = None,
                       output: str = None) -> pd.DataFrame:
    """Retrieve the Stambaugh-Yuan mispricing factors. Daily and monthly.

    Parameters:
        frequency (str): 'd' or 'm', case-insensitive (default: M).
        start_date (str, optional): passed to ``_process``.
        end_date (str, optional): passed to ``_process``.
        output (str, optional): passed to ``_process`` as ``filepath``.

    Returns:
        pandas.DataFrame: factor data with ``SMB`` renamed to ``SMB_SY`` and
        ``MKTRF`` renamed to ``Mkt-RF``, indexed by date (month-end dates
        for monthly data).

    Raises:
        ValueError: if ``frequency`` is neither daily nor monthly.
    """
    # Normalise once. The comparisons below are against lower-case codes, so
    # without this the default "M" skipped the date conversion and "D"
    # downloaded the monthly file.
    frequency = frequency.lower()

    if frequency not in ("d", "m"):
        print("Mispricing factors are only available for daily and monthly "
              "frequency.")
        raise ValueError("Frequency must be 'd' or 'm'.")

    file = "M4d" if frequency == "d" else "M4"
    url = f"https://finance.wharton.upenn.edu/~stambaug/{file}.csv"

    data = get_file_from_url(url)

    # The only model read with the pyarrow engine.
    data = pd.read_csv(data, index_col=0, parse_dates=False,
                       date_format="%Y%m%d", engine="pyarrow")

    data = data.rename(columns={"SMB": "SMB_SY",
                                "MKTRF": "Mkt-RF"}).rename_axis("date")

    if frequency == "d":
        data.index = pd.to_datetime(data.index, format="%Y%m%d")
    else:
        data.index = pd.to_datetime(data.index, format="%Y%m")
        data.index = data.index + pd.offsets.MonthEnd(0)

    return _process(data, start_date, end_date, filepath=output)
149
+
150
+
151
def q_factors(frequency: str = "M",
              start_date: str = None,
              end_date: str = None,
              output: str = None,
              classic: bool = False) -> pd.DataFrame:
    """Retrieve the q-factor model data.

    Parameters:
        frequency (str): D, W, M, Q or Y, case-insensitive (default: M).
            ``A`` is accepted as an alias for ``Y``.
        start_date (str, optional): passed to ``_process``.
        end_date (str, optional): passed to ``_process``.
        output (str, optional): passed to ``_process`` as ``filepath``.
        classic (bool): if True, drop the ``R_EG`` column.

    Returns:
        pandas.DataFrame: factor data multiplied by 0.01, indexed by date,
        with ``R_F`` renamed to ``RF`` and ``R_MKT`` renamed to ``Mkt-RF``.

    Raises:
        ValueError: if ``frequency`` is not supported.
    """
    frequency = frequency.upper()
    if frequency == "A":
        frequency = "Y"

    files = {"M": "monthly",
             "D": "daily",
             "Q": "quarterly",
             "W": "weekly",
             "Y": "annual", }

    # Fail early: an unknown frequency used to be formatted into the URL as
    # 'None' and only surfaced as a download error.
    if frequency not in files:
        raise ValueError("Frequency must be one of 'D', 'W', 'M', 'Q' or "
                         "'Y'.")
    file = files[frequency]

    base_url = 'https://global-q.org/uploads'
    url = f"{base_url}/1/2/2/6/122679606/q5_factors_{file}_2022.csv"

    # Monthly and quarterly files carry the date in two columns
    # (year + month/quarter); the others use a single leading column.
    index_cols = [0, 1] if frequency in ["M", "Q"] else [0]
    data = pd.read_csv(
        url, parse_dates=False, index_col=index_cols, float_precision="high")

    if classic:
        data = data.drop(columns=["R_EG"])

    data = data.rename(columns={"R_F": "RF"})

    data = np.multiply(data, 0.01)

    if frequency in ["M", "Q"]:
        # Need to insert "-" (monthly) or "Q" (quarterly) into date str.
        data = data.reset_index()
        col = "quarter" if frequency == "Q" else "month"
        char = "Q" if frequency == "Q" else "-"

        data["date"] = pd.PeriodIndex(
            data["year"].astype(str)
            + char
            + data[col].astype(str), freq=frequency
        ).to_timestamp(how="end")

        data["date"] = data["date"].dt.normalize()
        data = data.drop(["year", col], axis=1).set_index("date")

    if frequency == "Y":
        data.index = pd.to_datetime(data.index.astype(str)) \
            + pd.offsets.YearEnd(0)
    else:
        data.index = pd.to_datetime(data.index.astype(str))

    data.columns = data.columns.str.upper()
    data.index.name = "date"
    data = data.rename(columns={"R_MKT": "Mkt-RF"})

    return _process(data, start_date, end_date, filepath=output)
204
+
205
+
206
+ # Daniel-Hirshleifer-Sun Behavioural Factors
207
def dhs_factors(frequency: str = "M",
                start_date: str = None,
                end_date: str = None,
                output: str = None) -> pd.DataFrame:
    """Retrieve DHS factors from sheets on Lin Sun's website.

    The ``FIN`` and ``PEAD`` columns are downloaded, multiplied by 0.01 and
    combined with ``Mkt-RF`` and ``RF`` from the Fama-French 3-factor data
    for the same date span.

    Parameters:
        frequency (str): 'M' (monthly) or 'D' (daily), case-insensitive.
        start_date (str, optional): passed to ``_process``.
        end_date (str, optional): passed to ``_process``.
        output (str, optional): passed to ``_process`` as ``filepath``.

    Returns:
        pandas.DataFrame: columns Mkt-RF, FIN, PEAD, RF, indexed by date.

    Raises:
        ValueError: if ``frequency`` is neither monthly nor daily.
        requests.HTTPError: if the spreadsheet download fails.
    """
    frequency = frequency.lower()
    base_url = "https://docs.google.com/spreadsheets/d/"

    if frequency == "m":
        file = "1RxYLbCfk19m8fnniiJYfaj3yI55ZPaoi/export?format=xlsx"
    elif frequency == "d":
        file = "1KnCP-NVhf2Sni8bVFIVyMxW-vIljBOWE/export?format=xlsx"
    else:
        print("Frequency must be either 'M' (monthly) or 'D' (daily).")
        raise ValueError("Frequency must be 'M' or 'D'.")
    # TODO: use the link to the Google Sheet instead of the actual sheet.

    url = base_url + file

    response = requests.get(url, verify=True, timeout=20)
    # Surface HTTP failures here rather than as an Excel parsing error on
    # whatever error page came back (get_file_from_url does the same).
    response.raise_for_status()
    workbook = BytesIO(response.content)

    data = pd.read_excel(workbook, index_col="Date",
                         usecols=['Date', 'FIN', 'PEAD'], engine='openpyxl',
                         header=0, parse_dates=False)
    data.index.name = "date"

    if frequency == "d":
        data.index = pd.to_datetime(data.index, format="%m/%d/%Y")
    else:
        data.index = pd.to_datetime(data.index, format="%Y%m")
        data.index = data.index + pd.offsets.MonthEnd(0)

    data = np.multiply(data, 0.01)    # Decimalize before FF factors!

    ff = _get_ff_factors(model="3", frequency=frequency,
                         start_date=data.index[0], end_date=data.index[-1])
    ff = ff.round(4)
    # Note: FF source data is to 4 decimals; re-rounding here to avoid
    # rounding errors (e.g., 0.02 --> 0.019999999999999997)
    data = pd.concat([ff["Mkt-RF"], data, ff["RF"]], axis=1)
    data.index.name = "date"

    return _process(data, start_date, end_date, filepath=output)
251
+
252
+
253
def icr_factors(frequency: str = "M",
                start_date: str = None,
                end_date: str = None,
                output: str = None) -> pd.DataFrame:
    """Retrieve the He, Kelly, Manela (2017) ICR factors.

    * Daily since 1999-05-03; quarterly and monthly since 1970.

    Parameters:
        frequency (str): 'd', 'm' or 'q', case-insensitive (default: M).
        start_date (str, optional): passed to ``_process``.
        end_date (str, optional): passed to ``_process``.
        output (str, optional): passed to ``_process`` as ``filepath``.

    Returns:
        pandas.DataFrame: factor data with shortened column names, indexed
        by date.

    Raises:
        ValueError: if ``frequency`` is not one of 'd', 'm' or 'q'.
    """
    # TODO: Do we need Mkt-RF and RF [seen reffered to as 2-factor model]?
    frequency = frequency.lower()
    file_by_frequency = {"d": "daily", "m": "monthly", "q": "quarterly"}

    if frequency not in ["d", "m", "q"]:
        raise ValueError("Frequency must be 'd', 'm' or 'q'.")

    base_url = "https://voices.uchicago.edu/zhiguohe"
    file = file_by_frequency.get(frequency)
    url = f"{base_url}/files/2023/10/He_Kelly_Manela_Factors_{file}.csv"

    df = pd.read_csv(get_file_from_url(url))

    # The first column holds the dates, whatever it is called in the file.
    df = df.rename(columns={df.columns[0]: "date"})
    df = df.rename(columns={
        "intermediary_capital_ratio": "IC_RATIO",
        "intermediary_capital_risk_factor": "IC_RISK_FACTOR",
        "intermediary_leverage_ratio_squared": "INT_LEV_RATIO_SQ",
        "intermediary_value_weighted_investment_return": "INT_VW_ROI", })

    if frequency == "q":
        # The dates are YYYYQ. [19752 -> 1975Q2]
        raw_dates = df["date"].astype(str)
        df["date"] = raw_dates
        df["date"] = raw_dates.str[:-1] + "Q" + raw_dates.str[-1]
        df["date"] = pd.PeriodIndex(df["date"], freq="Q").to_timestamp() \
            + pd.offsets.QuarterEnd(0)
    elif frequency == "m":
        df["date"] = pd.to_datetime(df["date"], format="%Y%m")
        df["date"] = df["date"] + pd.offsets.MonthEnd(0)
    else:
        df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")

    df = df.set_index("date")

    return _process(df, start_date, end_date, filepath=output)
298
+
299
+
300
def q_classic_factors(frequency: str = "M",
                      start_date: str = None,
                      end_date: str = None,
                      output: str = None) -> pd.DataFrame:
    """Retrieve the classic q-factor model of Hou, Xue, and Zhang (2015).

    Thin wrapper around ``q_factors`` with ``classic=True``; all other
    arguments are forwarded unchanged.
    """
    return q_factors(frequency=frequency,
                     start_date=start_date,
                     end_date=end_date,
                     output=output,
                     classic=True)
307
+
308
+
309
def carhart_factors(frequency: str = "M",
                    start_date: str = None,
                    end_date: str = None,
                    output: str = None) -> pd.DataFrame:
    """Retrieve the Carhart 4-factor model data.

    Fetches model '4' through ``_get_ff_factors`` and hands the result to
    ``_process`` together with the date range and the output path.
    """
    carhart = _get_ff_factors(model='4',
                              frequency=frequency,
                              start_date=start_date,
                              end_date=end_date)
    return _process(carhart, start_date, end_date, filepath=output)
318
+
319
+
320
+ def _create_cache():
321
+ cache_dir = os.path.expanduser('~/.cache/getfactormodels')
322
+ if not os.path.exists(cache_dir):
323
+ os.makedirs(cache_dir)
324
+
325
+
326
def _hml_devil_cache_is_fresh(pickle_file, daily):
    """Return True if the cache was written today (daily) or this month."""
    written = pd.Timestamp.fromtimestamp(os.path.getmtime(pickle_file))
    now = pd.Timestamp.now()
    if daily:
        return written.date() == now.date()
    return (written.year, written.month) == (now.year, now.month)


def _download_hml_devil(daily):
    """Download the AQR workbook and return the USA factors as floats."""
    base_url = 'https://www.aqr.com/-/media/AQR/Documents/Insights'
    file = 'daily' if daily else 'monthly'
    url = f'{base_url}/Data-Sets/The-Devil-in-HMLs-Details-Factors-{file}.xlsx'

    print('Downloading HML Devil factors from AQR... This can take a while. Please be patient or something.')  # noqa

    # TODO: A progress bar until something's figured out? download_with_progress # noqa
    # TODO: Handle interupts SIGINT, etc.
    response = requests.get(url, verify=True, timeout=180)
    # Fail on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    xls = pd.ExcelFile(BytesIO(response.content))

    sheet_names = ['HML Devil', 'MKT', 'SMB', 'UMD', 'RF']
    df_dict = pd.read_excel(xls,
                            sheet_name=sheet_names,
                            skiprows=18,
                            header=0,
                            index_col=0,
                            parse_dates=True)

    dfs = []
    for sheet_name in sheet_names:
        df = df_dict[sheet_name]

        # Use the 'USA' column, except on the RF sheet where the first data
        # column is used.
        df = df.iloc[:, 0:1] if sheet_name == 'RF' else df[['USA']]
        # TODO: allow for other countries

        df.columns = [sheet_name]
        dfs.append(df)

    data = pd.concat(dfs, axis=1)
    data = data.rename(columns={'MKT': 'Mkt-RF',
                                'HML Devil': 'HML_DEVIL'})
    return data.astype(float)


def hml_devil_factors(frequency='M',
                      start_date: Optional[str] = None,
                      end_date: Optional[str] = None,
                      output: Optional[str] = None,
                      series=False) -> Union[pd.Series, pd.DataFrame]:
    """***EXPERIMENTAL***

    Retrieve the HML Devil factors from AQR.com. [FIXME: Slow.]

    Notes:
    - Slow. Very slow. So the downloaded data is cached as a pickle under
      ``~/.cache/getfactormodels`` and reused until the next day (daily
      data) or the next calendar month (everything else).
    - Cached and freshly downloaded data go through the same steps, so both
      honour ``output`` and rows with missing values are dropped for daily
      data only. Previously a cache hit ignored ``output`` and dropped
      missing rows for every frequency when returning a DataFrame.

    Parameters:
        frequency (str): The frequency of the data. M, D (default: M)
        start_date (str, optional): The start date of the data, YYYY-MM-DD.
        end_date (str, optional): The end date of the data, YYYY-MM-DD.
        output (str, optional): The filepath to save the output data.
        series (bool, optional): If True, return the HML Devil factors as a
            pandas Series.

    Returns:
        pd.DataFrame: the HML Devil model data indexed by date.
        pd.Series: the HML factor as a pd.Series
    """
    _create_cache()  # TODO: allow config and cache_file [not pickle]
    pickle_file = os.path.expanduser(f'~/.cache/getfactormodels/hml_devil_{frequency}.pkl')  # noqa
    daily = frequency.lower() == 'd'

    data = None
    if os.path.exists(pickle_file):
        if _hml_devil_cache_is_fresh(pickle_file, daily):
            # SECURITY: pickle.load can execute arbitrary code. This only
            # reads the cache file this function writes itself; never point
            # it at a file from an untrusted source.
            with open(pickle_file, 'rb') as f:
                data = pickle.load(f)
        else:
            # Expired cache: discard it and download again below.
            os.remove(pickle_file)

    if data is None:
        data = _download_hml_devil(daily)

        with open(pickle_file, 'wb') as f:
            pickle.dump(data, f)

        # NOTE(review): this deletes 'hml_devil.csv' from the current
        # working directory; kept from the original, presumably cleanup of
        # an older output file - confirm it is still wanted.
        if os.path.exists('hml_devil.csv'):
            os.remove('hml_devil.csv')

    data.index.name = 'date'
    data.index = pd.to_datetime(data.index)

    if daily:
        data = data.dropna()

    result = _process(data, start_date, end_date, filepath=output)
    return result.HML_DEVIL if series else result
427
+
428
+
429
def barillas_shanken_factors(frequency: str = 'M',
                             start_date: str = None,
                             end_date: str = None,
                             output: str = None) -> pd.DataFrame:
    """***Experimental.***

    Constructs the 6-factor model of Barillas and Shanken, the factor model
    with the highest posterior inclusion probability in Barillas and
    Shanken (2018). It is assembled from three sources, inner-joined on
    their date index:

    - ``R_IA`` and ``R_ROE`` from the classic q-factor data,
    - ``Mkt-RF``, ``SMB``, ``UMD`` and ``RF`` from the Fama-French 6-factor
      data,
    - Asness and Frazzini's HML Devil factor, renamed to ``HML_m``.

    Note:
    - Relies on the HML Devil factors being retrieved (which is very slow).

    Returns:
        pd.DataFrame: A timeseries of the factor data.
    """
    q_part = q_factors(frequency=frequency, classic=True)[['R_IA', 'R_ROE']]
    ff_columns = ['Mkt-RF', 'SMB', 'UMD', 'RF']
    ff_part = ff_factors(model='6', frequency=frequency)[ff_columns]

    combined = q_part.merge(ff_part, left_index=True, right_index=True,
                            how='inner')

    devil = hml_devil_factors(frequency=frequency, start_date=start_date,
                              series=True)
    devil = devil.rename('HML_m')
    devil.index.name = 'date'

    combined = combined.merge(devil, left_index=True, right_index=True,
                              how='inner')

    return _process(combined, start_date, end_date, filepath=output)
File without changes
@@ -0,0 +1,28 @@
1
+ # -*- coding: utf-8 -*-
2
+ import argparse
3
+
4
+
5
def parse_args(argv=None):
    """Argument parser, allowing for command line arguments.

    This is the function used in pyproject.toml to run the CLI.

    Parameters:
        argv (list of str, optional): arguments to parse. Defaults to None,
            in which case argparse reads ``sys.argv[1:]``.

    Returns:
        argparse.Namespace: with attributes ``model``, ``freq``, ``start``,
        ``end``, ``output`` and ``no_rf``.
    """
    parser = argparse.ArgumentParser(
        description='Retrieve and structure data for factor models.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''Example usage:
python main.py -m 3 -f M -s 1961-01-01 -e 1990-12-31
python main.py --model icr --frequency M --end 1990-12-31 --no_rf -o '~/icr.csv' '''  # noqa
    )
    parser.add_argument('-m', '--model', type=str, required=True,
                        help='The model to use.')
    # Adjacent literals instead of a backslash continuation, which embedded
    # the source indentation in the help string.
    parser.add_argument('-f', '--freq', '--frequency', type=str,
                        required=False, default='M',
                        help='The frequency of the data. '
                             'Valid options are D, W, M, Q, A.')
    parser.add_argument('-s', '--start', type=str, required=False,
                        help='The start date for the data.')
    parser.add_argument('-e', '--end', type=str, required=False,
                        help='The end date for the data.')
    parser.add_argument('-o', '--output', type=str, required=False,  # noqa
                        help='The file to save the data to.')
    parser.add_argument('--no_rf', '--no-rf', '--norf', action='store_true',
                        help='Drop the RF column from the DataFrame.')
    return parser.parse_args(argv)
@@ -0,0 +1,174 @@
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+ import zipfile as zip
4
+ from datetime import datetime
5
+ from io import BytesIO, StringIO
6
+ from pathlib import Path
7
+ from types import MappingProxyType
8
+ import pandas as pd
9
+ import requests
10
+ from dateutil import parser
11
+
12
# Maps each canonical model key to a regex of accepted aliases. Patterns are
# tried in insertion order with ``re.match`` (anchored at the start of the
# input only, case-insensitive), so the first entry that matches wins.
__model_input_map = MappingProxyType({
    "3": r"\b((f?)f)?3\b|(ff)?1993",
    "5": r"\b(ff)?5|ff2015\b",
    "4": r"\b(c(ar(hart)?)?4?|ff4|carhart1997|4)\b",
    "6": r"\b(ff)?6|ff2018\b",
    "q": r"\b(q(5)?|hmxz)\b",
    "q_classic": r"\b(q4|q(_)?classic)|classic_q\b",
    "mispricing": r"\b(sy4?|mispricing)|misp|yuan$|m4|mis|sy\b",
    "liquidity": r"^(il)?liq(uidity)?|(pastor|ps|sp)$",
    "icr": r"\bicr|hkm\b",
    "dhs": r"^(\bdhs\b|behav.*)$",
    "hml_devil": r"\bhml(_)?d(evil)?\b",
    "barillas_shanken": r"\b(bs|bs6|barillas|shanken)\b", })


def _get_model_key(model):
    """
    Convert a model name to a model key.
    * This provides more flexibility in input by converting various model names
    to a standardized model key.

    Args:
        model: the model name; it is converted with ``str()`` before matching,
            so integers such as ``3`` are accepted.

    Returns:
        str: one of the keys of the alias map.

    Raises:
        ValueError: if the name is neither a canonical key nor matched by any
            alias pattern.

    >>> _get_model_key('ff1993')
    '3'
    >>> _get_model_key('liQ')
    'liquidity'
    >>> _get_model_key('q4_factors')
    'q_classic'
    >>> _get_model_key('ICR')
    'icr'
    >>> _get_model_key('barillas_shanken')
    'barillas_shanken'
    """
    model = str(model)

    # A canonical key always maps to itself. Without this shortcut the key
    # 'barillas_shanken' is rejected: '_' is a word character, so the trailing
    # \b of its alias pattern cannot match right after 'barillas'.
    canonical = model.lower()
    if canonical in __model_input_map:
        return canonical

    for key, regex in __model_input_map.items():
        if re.match(regex, model, re.I):
            return key
    raise ValueError(f'Invalid model: {model}')
48
+
49
+
50
def get_file_from_url(url):
    """Download *url* and return its body, decoded as UTF-8, in a StringIO.

    The request verifies TLS certificates and times out after 15 seconds;
    ``raise_for_status()`` is called on the response before it is decoded.
    """
    resp = requests.get(url, verify=True, timeout=15)
    resp.raise_for_status()
    return StringIO(resp.content.decode('utf-8'))
57
+
58
+
59
def get_zip_from_url(url):
    """Download a zip file from a URL and return a ZipFile object.

    Args:
        url: address of the zip archive.

    Returns:
        zipfile.ZipFile: opened over an in-memory copy of the response body.

    Raises:
        requests.RequestException: if the request fails or
            ``raise_for_status()`` rejects the response; a message is printed
            first and the exception is re-raised unchanged.
    """
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
    except requests.RequestException as e:
        # Only request failures are reported here. A Ctrl-C
        # (KeyboardInterrupt) is no longer intercepted and described as a
        # download error; it simply propagates.
        print(f"An error occurred downloading the zip file from {url}: {e}")
        raise

    return zip.ZipFile(BytesIO(response.content))
70
+
71
+
72
def _save_to_file(data, filename=None, output_dir=None):
    """Save a pandas DataFrame or Series to a file.

    The writer is chosen from the file extension: ``.txt`` (tab-separated),
    ``.csv``, ``.xlsx``, ``.pkl`` or ``.md``.

    Args:
        data: the DataFrame or Series to write.
        filename: name of the file. ``None`` produces ``<YYYY-MM-DD>.csv``;
            a bare supported extension such as ``'.xlsx'`` produces
            ``<YYYY-MM-DD>.xlsx``; a name without any ``.`` gets ``.csv``
            appended.
        output_dir: directory to write into (``~`` is expanded). Defaults to
            the current working directory.

    Raises:
        ValueError: if ``data`` is not a DataFrame/Series, or the resulting
            file name does not end with a supported extension.
    """
    if not isinstance(data, (pd.DataFrame, pd.Series)):
        raise ValueError('Data is not a pandas DataFrame or Series')

    writers = {
        '.txt': lambda path: data.to_csv(path, sep='\t'),
        '.csv': data.to_csv,
        '.xlsx': data.to_excel,  # TODO: style with writer
        '.pkl': data.to_pickle,
        '.md': data.to_markdown, }

    today = datetime.now().strftime('%Y-%m-%d')
    if filename is None:
        filename = today + '.csv'
    elif filename in writers:
        # Only an extension was given: generate a name instead of writing a
        # hidden file that is literally called e.g. '.xlsx'.
        filename = today + filename
    elif '.' not in filename:
        filename += '.csv'

    # If no output directory is provided, use the current working directory;
    # otherwise expand a leading '~'.
    if output_dir is None:
        output_dir = Path.cwd()
    else:
        output_dir = Path(output_dir).expanduser()

    target = output_dir / filename

    # Pick the writer before touching anything, so an unsupported extension
    # fails without first announcing an overwrite that never happens.
    writer = next((func for ext, func in writers.items()
                   if str(target).endswith(ext)), None)
    if writer is None:
        raise ValueError('Unsupported file extension')

    if target.is_file():
        print('File exists: overwriting...')

    writer(str(target))
    print(f"File saved to: {target}")
112
+
113
+
114
def _rearrange_cols(data):
    """Put the 'Mkt-RF' column first and the 'RF' column last.

    A Series is returned untouched. For a DataFrame, every other column keeps
    its relative position, and either special column may be absent.
    """
    # [TODO] ICR model has no RF or Mkt Excess return column
    if isinstance(data, pd.Series):
        return data

    names = list(data.columns)
    if 'Mkt-RF' in names:
        names.remove('Mkt-RF')
        names.insert(0, 'Mkt-RF')
    if 'RF' in names:
        names.remove('RF')
        names.append('RF')
    return data.loc[:, names]
132
+
133
+
134
def _validate_date(date_str):
    """Normalise a date to a ``YYYY-MM-DD`` string.

    Args:
        date_str: ``None``, a ``date``/``datetime`` object (``pd.Timestamp``
            is a ``datetime`` subclass), or a string that
            ``dateutil.parser.parse`` can read.

    Returns:
        str or None: the formatted date, or ``None`` when ``date_str`` is
        ``None``.

    Raises:
        ValueError: if the value cannot be interpreted as a date.
    """
    # Local import: the module only imports the ``datetime`` class.
    from datetime import date

    if date_str is None:
        return None
    try:
        # ``datetime`` and ``pd.Timestamp`` are ``date`` subclasses, so date
        # objects are formatted directly instead of being handed to the
        # string parser.
        if isinstance(date_str, date):
            moment = date_str
        else:
            moment = parser.parse(date_str)
        return moment.strftime("%Y-%m-%d")
    except (TypeError, ValueError, OverflowError) as err:
        # Translate every failure into the same ValueError, including a
        # TypeError raised for non-string input.
        raise ValueError("Incorrect date format, use YYYY-MM-DD.") from err
144
+
145
+
146
def _slice_dates(data, start_date=None, end_date=None):
    """Return the rows of *data* between ``start_date`` and ``end_date``.

    With no bounds the input is returned unchanged. Each bound that is given
    is passed through ``_validate_date`` and used in a label slice on
    ``data.loc``; a missing bound leaves that side open.
    """
    if start_date is None and end_date is None:
        return data

    lower = None if start_date is None else _validate_date(start_date)
    upper = None if end_date is None else _validate_date(end_date)
    return data.loc[lower:upper]
157
+
158
+
159
def _process(data, start_date=None, end_date=None, filepath=None):
    """Process the data and optionally save it to a file.

    The columns are reordered with ``_rearrange_cols`` and the rows are
    restricted with ``_slice_dates``.

    Args:
        data: the DataFrame or Series to process.
        start_date: optional lower date bound.
        end_date: optional upper date bound.
        filepath: takes a filename, path or directory. When given, the
            processed data is also written with ``_save_to_file``.

    Returns:
        The processed data.
    """
    data = _rearrange_cols(data)
    data = _slice_dates(data, start_date, end_date)

    if filepath:
        # Convert the filepath to a Path object and expand the '~' character
        target = Path(filepath).expanduser()

        if target.is_dir():
            # An existing directory: save inside it under a generated name,
            # rather than treating the directory name as a file stem and
            # writing '<dir>.csv' next to it.
            _save_to_file(data, None, target)
        else:
            _save_to_file(data, target.name, target.parent)

    return data
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 S. Martin
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,234 @@
1
+ Metadata-Version: 2.1
2
+ Name: getfactormodels
3
+ Version: 0.0.1
4
+ Summary: Retrieve data for various multifactor asset pricing models.
5
+ Keywords: finance,pricing models,financial analysis,econometrics,asset pricing,multifactor models
6
+ Author-email: "S. Martin" <x512@pm.me>
7
+ Requires-Python: >=3.7
8
+ Description-Content-Type: text/markdown
9
+ Classifier: Topic :: Office/Business :: Financial
10
+ Classifier: Topic :: Office/Business :: Financial :: Investment
11
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
12
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3 :: Only
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Environment :: Console
17
+ Classifier: License :: OSI Approved :: MIT License
18
+ Classifier: Intended Audience :: Financial and Insurance Industry
19
+ Classifier: Development Status :: 2 - Pre-Alpha
20
+ Requires-Dist: numpy >=1.18.5
21
+ Requires-Dist: pandas >=1.4
22
+ Requires-Dist: requests >=2.20.0
23
+ Requires-Dist: pyarrow >=14.0.1
24
+ Requires-Dist: openpyxl >=3.0.3
25
+ Requires-Dist: tabulate >=0.8.7
26
+ Requires-Dist: ruff ; extra == "dev"
27
+ Requires-Dist: pytest-cov ; extra == "dev"
28
+ Requires-Dist: pytest ; extra == "dev"
29
+ Requires-Dist: pytest-randomly ; extra == "dev"
30
+ Requires-Dist: isort ; extra == "dev"
31
+ Requires-Dist: nox ; extra == "dev"
32
+ Project-URL: Homepage, https://github.com/x512/getfactormodels
33
+ Provides-Extra: dev
34
+
35
+ <a name="readme-top"></a>
36
+
37
+ # getfactormodels
38
+
39
+ ![Python 3.11](https://img.shields.io/badge/Python-3.7+-306998.svg?logo=python&logoColor=ffde57&style=flat-square)
40
+
41
+
42
+ Reliably retrieve data for various multi-factor asset pricing models.
43
+
44
+ ## Models
45
+
46
+ - The 3-factor, 5-factor, and 6-factor models of Fama & French <sup>[[1]](#1) [[3]](#3) [[4]](#4)</sup>
47
+ - Mark Carhart's 4-factor model <sup>[[2]](#2)</sup>
48
+ - Pastor and Stambaugh's liquidity factors <sup>[[5]](#5)</sup>
49
+ - Mispricing factors of Stambaugh and Yuan<sup>[[6]](#6)</sup>
50
+ - The $q$*-factor* model of Hou, Mo, Xue and Zhang<sup>[[7]](#7)</sup>
51
+ - The augmented $q^5$*-factor* model of Hou, Mo, Xue and Zhang<sup>[[8]](#8)</sup>
52
+ - *Intermediary Capital Ratio* (ICR) of He, Kelly & Manela<sup>[[9]](#9)</sup>
53
+ - The *DHS behavioural factors* of Daniel, Hirshleifer & Sun<sup>[[10]](#10)</sup>
54
+ - The *HML* $^{DEVIL}$ factor of Asness & Frazzini<sup>[[11]](#11)</sup>
55
+ - The 6-factor model of Barillas and Shanken<sup>[[12]](#12)</sup>
56
+
57
+
58
+ _Thanks to: Kenneth French, Robert Stambaugh, Lin Sun, Zhiguo He, AQR Capital Management (AQR.com) and Hou, Xue and Zhang (global-q.org), for their research and for the datasets they publicly provide._
59
+
60
+
61
+ ## Installation
62
+
63
+ `getfactormodels` requires Python ``>=3.7``
64
+
65
+ * Install with pip:
66
+ ```shell
67
+ $ pip install getfactormodels
68
+ ```
69
+
70
+ ## Usage
71
+
72
+ #### Python
73
+
74
+ After installing, import ``getfactormodels`` and call ``get_factors()`` with the ``model`` and ``frequency`` parameters. Optionally, specify a ``start_date`` and ``end_date``
75
+ * For example, to retrieve the daily q-factor model data:
76
+
77
+ ```py
78
+ import getfactormodels as getfactormodels
79
+
80
+ df = getfactormodels.get_factors(model='q', frequency='d')
81
+ ```
82
+ > _Trimmed output:_
83
+ ```txt
84
+ > df
85
+ Mkt-RF R_ME R_IA R_ROE R_EG RF
86
+ date
87
+ 1967-01-03 0.000778 0.004944 0.001437 -0.007118 -0.008563 0.000187
88
+ 1967-01-04 0.001667 -0.003487 -0.000631 -0.002044 -0.000295 0.000187
89
+ 1967-01-05 0.012990 0.004412 -0.005688 0.000838 -0.003075 0.000187
90
+ 1967-01-06 0.007230 0.006669 0.008897 0.003603 0.002669 0.000187
91
+ 1967-01-09 0.008439 0.006315 0.000331 0.004949 0.002979 0.000187
92
+ ... ... ... ... ... ... ...
93
+ 2022-12-23 0.005113 -0.001045 0.004000 0.010484 0.003852 0.000161
94
+ 2022-12-27 -0.005076 -0.001407 0.010190 0.009206 0.003908 0.000161
95
+ 2022-12-28 -0.012344 -0.004354 0.000133 -0.010457 -0.004953 0.000161
96
+ 2022-12-29 0.018699 0.008568 -0.008801 -0.012686 -0.002162 0.000161
97
+ 2022-12-30 -0.002169 0.001840 0.001011 -0.004151 -0.003282 0.000161
98
+
99
+ [14096 rows x 6 columns]
100
+ ```
101
+
102
+ * or, retrieve the monthly liquidity factors of Pastor and Stambaugh for the 1990s:
103
+
104
+ ```py
105
+ import getfactormodels as getfactormodels
106
+
107
+ df = getfactormodels.get_factors(model='liquidity', frequency='m', start_date='1990-01-01', end_date='1999-12-31')
108
+ ```
109
+ > If you don't have time to type `liquidity`, type `liq`, or `ps`--there's a handy regex.
110
+
111
+ * or, saving the monthly 3-factor model of Fama & French to a file:
112
+
113
+ ```py
114
+ import getfactormodels as gfm
115
+
116
+ df = gfm.get_factors(model='ff3', frequency='m', output="ff3_data.csv")
117
+ ```
118
+ >The output parameter accepts a filename, path or directory, and can be one of csv, md, txt, xlsx, pkl.
119
+
120
+
121
+ You can also import just the models that you need.
122
+
123
+ * For example, to import only the *ICR* and *q*-factor models:
124
+
125
+ ```py
126
+ from getfactormodels import icr_factors, q_factors
127
+
128
+ # Passing a model function with no params defaults to monthly.
129
+ df = icr_factors()
130
+
131
+ # The 'q' models, and the 3-factor model of Fama-French also have weekly data.
132
+ df = q_factors(frequency="W", start_date="1992-01-01")
133
+ ```
134
+
135
+ * If using ``ff_factors()``, then an additional ``model`` parameter should be specified:
136
+
137
+ ```py
138
+ from getfactormodels import ff_factors
139
+
140
+ # To get annual data for the 5-factor model:
141
+ data = ff_factors(model="5", frequency="Y", output=".xlsx")
142
+
143
+ # Daily 3-factor model data, since 1970 (not specifying an end date
144
+ # will return data up until today):
145
+ data = ff_factors(model="3", frequency="D", start_date="1970-01-01")
146
+ ```
147
+ > Output allows just an extension to be specified.
148
+
149
+ * or import all the models:
150
+
151
+ ```py
152
+ from getfactormodels import models
153
+ ```
154
+
155
+ * There's also the `FactorExtractor` class that the CLI uses (it doesn't really do a whole lot yet):
156
+
157
+ ```python
158
+ from getfactormodels import FactorExtractor
159
+
160
+ fe = FactorExtractor(model='carhart', frequency='m', start_date='1980-01-01', end_date='1980-05-01')
161
+ fe.get_factors()
162
+ fe.to_file('carhart_factors.md')
163
+ ```
164
+
165
+ * _The resulting ``carhart_factors.md`` file will look like this:_
166
+
167
+ | date | Mkt-RF | SMB | HML | MOM | RF |
168
+ |:--------------------|---------:|--------:|--------:|--------:|-------:|
169
+ | 1980-01-31 00:00:00 | 0.0551 | 0.0162 | 0.0175 | 0.0755 | 0.008 |
170
+ | 1980-02-29 00:00:00 | -0.0122 | -0.0185 | 0.0061 | 0.0788 | 0.0089 |
171
+ | 1980-03-31 00:00:00 | -0.129 | -0.0664 | -0.0101 | -0.0955 | 0.0121 |
172
+ | 1980-04-30 00:00:00 | 0.0397 | 0.0105 | 0.0106 | -0.0043 | 0.0126 |
173
+
174
+
175
+ #### Using the CLI
176
+ * You can also use getfactormodels from the command line.
177
+
178
+ ```bash
179
+ $ getfactormodels -h
180
+
181
+ usage: getfactormodels [-h] -m MODEL [-f FREQ] [-s START] [-e END] [-o OUTPUT] [--no_rf]
182
+ ```
183
+
184
+ * An example of how to use the CLI to retrieve the Fama-French 3-factor model data:
185
+ ```bash
186
+ getfactormodels --model ff3 --frequency M --start 1960-01-01 --end 2020-12-31 --output "filename.csv"
187
+ ```
188
+ > Accepted file extensions are .csv, .txt, .xlsx, .pkl and .md. If no extension is given, the output file will be .csv. The --output flag accepts a filename, a file path or a directory. If only an extension is provided (including the leading `.`; without it the value is treated as a filename), a name will be generated.
189
+
190
+ * Here's another example that retrieves the annual Fama-French 5-factor data without the RF column:
191
+
192
+ ```sh
193
+ getfactormodels -m 5 -f Y -s 1960-01-01 -e 2020-12-31 --no_rf -o ~/some_dir/filename.xlsx
194
+ ```
195
+ > `--no_rf` will return the factor model without an RF column.
196
+
197
+ ## References
198
+ 1. <a id="1"></a> E. F. Fama and K. R. French, ‘Common risk factors in the returns on stocks and bonds’, *Journal of Financial Economics*, vol. 33, no. 1, pp. 3–56, 1993. [PDF](https://people.duke.edu/~charvey/Teaching/BA453_2006/FF_Common_risk.pdf)
199
+ 2. <a id="2"></a> M. Carhart, ‘On Persistence in Mutual Fund Performance’, *Journal of Finance*, vol. 52, no. 1, pp. 57–82, 1997. [PDF](https://onlinelibrary.wiley.com/doi/full/10.1111/j.1540-6261.1997.tb03808.x)
200
+ 3. <a id="3"></a> E. F. Fama and K. R. French, ‘A five-factor asset pricing model’, *Journal of Financial Economics*, vol. 116, no. 1, pp. 1–22, 2015. [PDF](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2287202)
201
+ 4. <a id="4"></a> E. F. Fama and K. R. French, ‘Choosing factors’, *Journal of Financial Economics*, vol. 128, no. 2, pp. 234–252, 2018. [PDF](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2668236)
202
+ 5. <a id="5"></a>L. Pastor and R. Stambaugh, ‘Liquidity Risk and Expected Stock Returns’, *Journal of Political Economy*, vol. 111, no. 3, pp. 642–685, 2003. [PDF](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=279804)
203
+ 6. <a id="6"></a>R. F. Stambaugh and Y. Yuan, ‘Mispricing Factors’, *The Review of Financial Studies*, vol. 30, no. 4, pp. 1270–1315, 12 2016. [PDF](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2626701)
204
+ 7. <a id="7"></a>K. Hou, H. Mo, C. Xue, and L. Zhang, ‘Which Factors?’, *National Bureau of Economic Research, Inc*, 2014. [PDF](https://academic.oup.com/rof/article/23/1/1/5133564)
205
+ 8. <a id="8"></a>K. Hou, H. Mo, C. Xue, and L. Zhang, ‘An Augmented q-Factor Model with Expected Growth*’, *Review of Finance*, vol. 25, no. 1, pp. 1–41, 02 2020. [PDF](https://academic.oup.com/rof/article/25/1/1/5727769)
206
+ 9. <a id="9"></a>Z. He, B. Kelly, and A. Manela, ‘Intermediary asset pricing: New evidence from many asset classes’, *Journal of Financial Economics*, vol. 126, no. 1, pp. 1–35, 2017. [PDF](https://cpb-us-w2.wpmucdn.com/voices.uchicago.edu/dist/6/2325/files/2019/12/jfepublishedversion.pdf)
207
+ 10. <a id="10"></a>K. Daniel, D. Hirshleifer, and L. Sun, ‘Short- and Long-Horizon Behavioral Factors’, *Review of Financial Studies*, vol. 33, no. 4, pp. 1673–1736, 2020. [PDF](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3086063)
208
+ 11. <a id="11"></a>C. Asness and A. Frazzini, ‘The Devil in HML’s Details’, *The Journal of Portfolio Management*, vol. 39, pp. 49–68, 2013. [PDF](https://stockmarketmba.com/docs/Asness_Frazzini_AdjustHML.pdf)
209
+ 12. <a id="12"></a>F. Barillas and J. Shanken, ‘Comparing Asset Pricing Models’, *Journal of Finance*, vol. 73, no. 2, pp. 715–754, 2018. [PDF](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2700000)
210
+
211
+ **Data sources:**
212
+
213
+ * K. French, "Data Library," Tuck School of Business at Dartmouth.
214
+ [Link](https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html)
215
+ * R. Stambaugh, "Liquidity" and "Mispricing" factor datasets, Wharton School, University of Pennsylvania.
216
+ [Link](https://finance.wharton.upenn.edu/~stambaug/)
217
+ * Z. He, "Intermediary Capital Ratio and Risk Factor" dataset, University of Chicago.
218
+ [Link](https://voices.uchicago.edu/zhiguohe/data-and-empirical-patterns/intermediary-capital-ratio-and-risk-factor/)
219
+ * K. Hou, C. Xue, L. Zhang, "The Hou-Xue-Zhang q-factors data library," at global-q.org.
220
+ [Link](http://global-q.org/factors.html)
221
+ * AQR Capital Management's Data Sets.
222
+ * Lin Sun, DHS Behavioural factors [Link](https://sites.google.com/view/linsunhome)
223
+
224
+ <p align="right">(<a href="#readme-top">back to top</a>)</p>
225
+
226
+ ## License
227
+
228
+ ![License](https://img.shields.io/badge/MIT-blue?style=for-the-badge&logo=license&colorA=grey&colorB=blue)
229
+
230
+ *The code in this project is released under the [MIT License]().*
231
+
232
+ [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat-square&labelColor=ef8336)](https://pycqa.github.io/isort/)
233
+ [![Ruff](https://img.shields.io/badge/-ruff-%23261230?style=flat-square&logo=ruff&logoColor=d7ff64)](https://simpleicons.org/?q=ruff)
234
+
@@ -0,0 +1,13 @@
1
+ getfactormodels/__init__.py,sha256=OBR7bzlxouVp0fnnNiTPEu2Dg5WT7jY-Ss4AXhpPaQ0,1882
2
+ getfactormodels/__main__.py,sha256=wog0NDpyKmUmFWJ0brIJIPoKN_m0HGXtLiXEuO3PrRs,5565
3
+ getfactormodels/models/__init__.py,sha256=TuTNVPCEwd9xxlAHzoEk4sYDenYQcBdCiMbiulOT-Y0,1237
4
+ getfactormodels/models/ff_models.py,sha256=8ek3Q40acyOuILcbiAJvYfPnyPisY4oPapeRf4w9FzI,4492
5
+ getfactormodels/models/models.py,sha256=VdRWssoqKZKojJyzRj0KstXbLex7WfPU5bgN9NblQoE,17884
6
+ getfactormodels/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ getfactormodels/utils/cli.py,sha256=Gtyuph2HCYgnx-qy6Aq4QRgDJzzxQ_bPX5iG463uJkE,1455
8
+ getfactormodels/utils/utils.py,sha256=G-LcqJLWP-VGt3QlLsmQDwd7fbFK4itlNwpeADZRE_s,5559
9
+ getfactormodels-0.0.1.dist-info/entry_points.txt,sha256=BeSOuEFV8LlnhTxpKpbeJLQfl_kS-bVif0k4Z1ghOnY,65
10
+ getfactormodels-0.0.1.dist-info/LICENSE,sha256=3AA29XMl8p-SVQzn1hMvq478uj3FHVjEUknv6YaGeYk,1066
11
+ getfactormodels-0.0.1.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
12
+ getfactormodels-0.0.1.dist-info/METADATA,sha256=KKWEMEIvtRE--mXWonwR8Mwj05zxrKryyRtIwfmzbVg,11895
13
+ getfactormodels-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: flit 3.9.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ getfactormodels=getfactormodels.__main__:main
3
+