jupyter-data-fetch 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 伍侃
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,90 @@
1
+ Metadata-Version: 2.4
2
+ Name: jupyter-data-fetch
3
+ Version: 0.1.0
4
+ Summary: fetch data from jupyter notebook
5
+ License-File: LICENSE
6
+ Requires-Python: >=3.9
7
+ Requires-Dist: jupyter-kernel-client
8
+ Requires-Dist: pandas<3.0
9
+ Requires-Dist: pillow
10
+ Description-Content-Type: text/markdown
11
+
12
+ # jupyter-data-fetch
13
+
14
+ 从`jupyterlab`、`jupyter notebook`中抓取数据的示例。**代码就一个文件,过于简单,故未打包发布**
15
+
16
+ ## 优点
17
+
18
+ 1. 与`ksrpc`比,通用性更强,理论上全平台通用
19
+ 2. 不需中转服务器,网页能打开就能使用
20
+
21
+ ## 缺点
22
+
23
+ 1. `ksrpc`传输是二进制,而本项目编码成了`base85/base64`,速度较慢
24
+ 2. 传输带宽消耗多,`base64`多占33%,`base85`多占25%
25
+
26
+ ## 安装
27
+
28
+ 1. 将`codec.py`复制到自己的项目中
29
+ 2. `uv pip install -r requirements.txt`,其中关键的是`uv pip install jupyter_kernel_client`库
30
+
31
+ ## 使用方法
32
+
33
+ 1. `examples`下提供了示例
34
+ 2. 以`joinquant`为例,打开浏览器,登录研究环境,按`F12`打开开发者工具
35
+ 3. 搜索`kernels`,复制`请求URL`和`Cookie`
36
+ ![devtool.png](docs/devtool.png)
37
+ 4. 替换示例中`Cookie`和`server_url`即可
38
+ ![ide.png](docs/ide.png)
39
+ 5. 留意:`server_url`只复制一段。`Cookie`要完整复制
40
+
41
+ ## 最简示例
42
+
43
+ ```python
44
+ from jupyter_kernel_client import KernelClient
45
+
46
+ from jupyter_data_fetch.codec import JupyterTextCodec
47
+
48
+ # ... 省去部分代码。更多参考examples/joinquant.py
49
+
50
+ with KernelClient(server_url="https://www.joinquant.com/user/12345678901", token=None, headers=headers) as kernel:
51
+ # 一定要保证缩进正确
52
+ code = """
53
+ df = get_fundamentals(query(
54
+ valuation, income
55
+ ).filter(
56
+ # 这里不能使用 in 操作, 要使用in_()函数
57
+ valuation.code.in_(['000001.XSHE', '600000.XSHG'])
58
+ ), date='2015-10-15')
59
+ """
60
+ reply = kernel.execute(JupyterTextCodec.generate_code(code, var_name='df'))
61
+ # print(reply)
62
+ obj = JupyterTextCodec.extract_decode(reply)
63
+ print(obj)
64
+
65
+ ```
66
+
67
+ ## 进阶函数
68
+
69
+ 1. 由于`code`是字符串,动态传入`list/dict`太麻烦,所以还提供了`auto_execute`
70
+ 2. 用户提前对函数套上`auto_execute`装饰器,就可以快速使用
71
+ 3. 参考[examples/jqresearch.py](examples/jqresearch.py)
72
+
73
+ ## 自动登录并获取数据的完整示例
74
+
75
+ 参考[examples/playwright/joinquant.py](examples/playwright/joinquant.py)
76
+
77
+ ## 核心代码
78
+
79
+ 1. `JupyterTextCodec`: 目前使用`base85`编解码器,使用字符串传输数据,压缩率高。字符串被截断时,必须使用`JupyterImageCodec`
80
+ 2. `JupyterImageCodec`: 图片编解码器,使用图片传输数据,`base64`编码压缩率低
81
+ 3. `generate_code`生成可在`Notebook`单元格中运行的代码字符串,一定要指定需要获取的变量名`var_name`
82
+ 4. `kernel.execute`在服务端执行字符串代码,返回`json`对象
83
+ 5. `extract_decode`从`json`中提取数据后解码成对象
84
+
85
+ ## 注意
86
+
87
+ 1. 由于各平台限制,`generate_code`生成的代码可能无法运行,可以复制到`Notebook`中测试
88
+ 2. `python3.6`问题太多,可以打开一个`ipynb`文件后,通过菜单更改内核为最新版
89
+ 3. 可以连接到已经打开的内核,只要提供`kernel_id`参数即可。参考`ricequant.py`示例
90
+ 4. `Notebook`中可以导入当前目录中`py`,但本项目直接使用当前目录是`/`,导致导入失败,通过指定`kernel_id`可解决
@@ -0,0 +1,79 @@
1
+ # jupyter-data-fetch
2
+
3
+ 从`jupyterlab`、`jupyter notebook`中抓取数据的示例。**代码就一个文件,过于简单,故未打包发布**
4
+
5
+ ## 优点
6
+
7
+ 1. 与`ksrpc`比,通用性更强,理论上全平台通用
8
+ 2. 不需中转服务器,网页能打开就能使用
9
+
10
+ ## 缺点
11
+
12
+ 1. `ksrpc`传输是二进制,而本项目编码成了`base85/base64`,速度较慢
13
+ 2. 传输带宽消耗多,`base64`多占33%,`base85`多占25%
14
+
15
+ ## 安装
16
+
17
+ 1. 将`codec.py`复制到自己的项目中
18
+ 2. `uv pip install -r requirements.txt`,其中关键的是`uv pip install jupyter_kernel_client`库
19
+
20
+ ## 使用方法
21
+
22
+ 1. `examples`下提供了示例
23
+ 2. 以`joinquant`为例,打开浏览器,登录研究环境,按`F12`打开开发者工具
24
+ 3. 搜索`kernels`,复制`请求URL`和`Cookie`
25
+ ![devtool.png](docs/devtool.png)
26
+ 4. 替换示例中`Cookie`和`server_url`即可
27
+ ![ide.png](docs/ide.png)
28
+ 5. 留意:`server_url`只复制一段。`Cookie`要完整复制
29
+
30
+ ## 最简示例
31
+
32
+ ```python
33
+ from jupyter_kernel_client import KernelClient
34
+
35
+ from jupyter_data_fetch.codec import JupyterTextCodec
36
+
37
+ # ... 省去部分代码。更多参考examples/joinquant.py
38
+
39
+ with KernelClient(server_url="https://www.joinquant.com/user/12345678901", token=None, headers=headers) as kernel:
40
+ # 一定要保证缩进正确
41
+ code = """
42
+ df = get_fundamentals(query(
43
+ valuation, income
44
+ ).filter(
45
+ # 这里不能使用 in 操作, 要使用in_()函数
46
+ valuation.code.in_(['000001.XSHE', '600000.XSHG'])
47
+ ), date='2015-10-15')
48
+ """
49
+ reply = kernel.execute(JupyterTextCodec.generate_code(code, var_name='df'))
50
+ # print(reply)
51
+ obj = JupyterTextCodec.extract_decode(reply)
52
+ print(obj)
53
+
54
+ ```
55
+
56
+ ## 进阶函数
57
+
58
+ 1. 由于`code`是字符串,动态传入`list/dict`太麻烦,所以还提供了`auto_execute`
59
+ 2. 用户提前对函数套上`auto_execute`装饰器,就可以快速使用
60
+ 3. 参考[examples/jqresearch.py](examples/jqresearch.py)
61
+
62
+ ## 自动登录并获取数据的完整示例
63
+
64
+ 参考[examples/playwright/joinquant.py](examples/playwright/joinquant.py)
65
+
66
+ ## 核心代码
67
+
68
+ 1. `JupyterTextCodec`: 目前使用`base85`编解码器,使用字符串传输数据,压缩率高。字符串被截断时,必须使用`JupyterImageCodec`
69
+ 2. `JupyterImageCodec`: 图片编解码器,使用图片传输数据,`base64`编码压缩率低
70
+ 3. `generate_code`生成可在`Notebook`单元格中运行的代码字符串,一定要指定需要获取的变量名`var_name`
71
+ 4. `kernel.execute`在服务端执行字符串代码,返回`json`对象
72
+ 5. `extract_decode`从`json`中提取数据后解码成对象
73
+
74
+ ## 注意
75
+
76
+ 1. 由于各平台限制,`generate_code`生成的代码可能无法运行,可以复制到`Notebook`中测试
77
+ 2. `python3.6`问题太多,可以打开一个`ipynb`文件后,通过菜单更改内核为最新版
78
+ 3. 可以连接到已经打开的内核,只要提供`kernel_id`参数即可。参考`ricequant.py`示例
79
+ 4. `Notebook`中可以导入当前目录中`py`,但本项目直接使用当前目录是`/`,导致导入失败,通过指定`kernel_id`可解决
@@ -0,0 +1 @@
1
+ from jupyter_data_fetch._version import __version__
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,191 @@
1
+ import base64
2
+ from io import BytesIO
3
+ from types import SimpleNamespace
4
+
5
+ import pandas as pd
6
+
7
# Kernel-side source template shared by both codecs. `{0}` is filled in with
# `str.format` and names the variable to ship back. After the snippet runs, the
# kernel holds the gzip-compressed pickle of that variable in `compressed`.
# The first attempt lets pandas compress while pickling; if that raises OSError
# (the inline note records "write() on read-only GzipFile object"), the snippet
# pickles uncompressed and gzips the bytes itself.
# The text between the quotes is executed remotely, so it is runtime data and is
# left untouched (its first comment reads "python3.6 is not recommended").
BASE_CODE = """
# 不建议用python3.6

from io import BytesIO
import pandas as pd

try:
    buf = BytesIO()
    pd.to_pickle({0}, buf, compression='gzip') # OSError: [Errno 9] write() on read-only GzipFile object
    buf.seek(0)
    compressed = buf.read()
except OSError:
    import gzip

    buf = BytesIO()
    pd.to_pickle({0}, buf)
    buf.seek(0) # ValueError: I/O operation on closed file.
    compressed = gzip.compress(buf.getvalue())
"""
26
+
27
# Kernel-side helper source. It defines `object_to_dict(obj, exclude=None)`,
# which copies every attribute of `obj` that is public (no leading underscore),
# not listed in `exclude` and not callable into a plain dict. Callers prepend
# this snippet to remote code when the result is an instance of a class the
# client cannot unpickle. The text between the quotes is executed remotely, so
# it is runtime data and is left untouched.
TO_DICT_CODE = """
# 客户端无法反序列化的类,转换成字典
def object_to_dict(obj, exclude=None):

    exclude = set(exclude or [])
    return {
        attr: getattr(obj, attr)
        for attr in dir(obj)
        if not attr.startswith('_')
        and attr not in exclude
        and not callable(getattr(obj, attr))
    }
"""
40
+
41
+
42
def dict_to_object(d, exclude=None):
    """Rebuild a namespace object from a dictionary.

    Args:
        d: Mapping of attribute names to values.
        exclude: Optional iterable of keys to leave out of the result.

    Returns:
        A ``SimpleNamespace`` with one attribute per retained key.
    """
    skipped = set(exclude) if exclude else set()
    kept = {}
    for key, value in d.items():
        if key in skipped:
            continue
        kept[key] = value
    return SimpleNamespace(**kept)
47
+
48
+
49
def extract_from_reply(reply):
    """Return the stream text carried by a kernel execution reply.

    Per the original note, output produced by ``print`` and by ``!`` shell
    commands is read through this path.

    The relevant entry is searched for rather than assumed to be
    ``outputs[0]``: when something else is emitted first, indexing position 0
    would fail with an unrelated ``KeyError`` and, on failures, hide the
    remote traceback.

    Args:
        reply: Mapping with a ``'status'`` string and an ``'outputs'`` list of
            dicts (assumed to be nbformat-style outputs -- TODO confirm
            against the kernel client in use).

    Returns:
        The ``'text'`` value of the first output that has one.

    Raises:
        RuntimeError: If ``reply['status']`` is ``'error'``; the message
            carries the remote traceback when one is present.
        KeyError: If the reply succeeded but no output carries ``'text'``.
    """
    outputs = reply['outputs']

    if reply['status'] == 'error':
        for output in outputs:
            if 'traceback' in output:
                error_msg = '\n'.join(output['traceback'])
                break
        else:
            error_msg = '(no traceback found in reply outputs)'
        raise RuntimeError(f"Jupyter execution error:\n{error_msg}")

    for output in outputs:
        if 'text' in output:
            return output['text']
    raise KeyError("no output with a 'text' field in reply")
56
+
57
+
58
class JupyterTextCodec:
    """Ship a kernel variable to the client as base85 text.

    Encoding (generated code, runs in the kernel):
        1. Pickle the variable with gzip compression.
        2. Base85-encode the bytes (more compact than base64).
        3. Leave the encoded string as the cell's final expression so it is
           returned as the ``text/plain`` result. The original notes warn
           that some platforms limit text output length; the image codec is
           the suggested workaround.

    Decoding (runs on the client):
        1. Pull the ``text/plain`` payload out of the reply.
        2. Base85-decode, gunzip and unpickle it.
    """

    @staticmethod
    def generate_code(*codes, var_name='df'):
        """Build the source to execute remotely.

        Args:
            *codes: Source fragments, joined with newlines, that create the
                variable. They are embedded at column 0, so they must be
                valid top-level code.
            var_name: Name of the kernel variable to send back.

        Returns:
            A source string whose last expression is the base85 payload.
        """
        codes_str = '\n'.join(codes)
        return f"""
{codes_str}

{BASE_CODE.format(var_name)}

import base64

base64.b85encode(compressed).decode('ascii')
"""

    @staticmethod
    def extract_from_reply(reply):
        """Return the ``text/plain`` payload of an execution reply.

        The output is searched for, last match first, instead of assuming it
        sits at ``outputs[0]``: anything the user code emits beforehand would
        otherwise cause an unrelated ``KeyError`` or hide the traceback.

        Args:
            reply: Mapping with ``'status'`` and an ``'outputs'`` list of
                dicts (assumed nbformat-style -- TODO confirm).

        Raises:
            RuntimeError: If the remote execution reported an error.
            KeyError: If no output carries ``data['text/plain']``.
        """
        outputs = reply['outputs']

        if reply['status'] == 'error':
            for output in outputs:
                if 'traceback' in output:
                    error_msg = '\n'.join(output['traceback'])
                    break
            else:
                error_msg = '(no traceback found in reply outputs)'
            raise RuntimeError(f"Jupyter execution error:\n{error_msg}")

        # The payload is the cell's final expression, so prefer the last match.
        for output in reversed(outputs):
            data = output.get('data')
            if data and 'text/plain' in data:
                return data['text/plain']
        raise KeyError("no output with a 'text/plain' payload in reply")

    @staticmethod
    def decode(text):
        """Turn the base85 payload back into the original object.

        Args:
            text: The payload, normally still wrapped in the quotes of a
                string ``repr`` (that is how ``text/plain`` shows a str).

        Returns:
            The unpickled object.
        """
        # Strip the repr quotes only when they are really there; the base85
        # alphabet contains no quote characters, so this cannot eat payload.
        if len(text) >= 2 and text[0] == text[-1] and text[0] in ('"', "'"):
            text = text[1:-1]
        # NOTE(security): unpickling runs arbitrary code chosen by the sender.
        # Only use this with kernels/servers you trust.
        return pd.read_pickle(BytesIO(base64.b85decode(text)), compression='gzip')

    @staticmethod
    def extract_decode(reply):
        """Extract the payload from ``reply`` and decode it in one step."""
        text = JupyterTextCodec.extract_from_reply(reply)
        return JupyterTextCodec.decode(text)
102
+
103
+
104
class JupyterImageCodec:
    """Ship a kernel variable to the client disguised as a grayscale image.

    Encoding (generated code, runs in the kernel):
        1. Pickle the variable with gzip compression.
        2. Zero-pad the bytes to a square and view them as a uint8 image.
        3. Leave the image as the cell's final expression; the notebook
           display machinery delivers it base64-encoded.

    Decoding (runs on the client):
        1. Pull the ``image/png`` base64 string out of the reply.
        2. Base64-decode it and open it as an image.
        3. Read the pixel data back as bytes.
        4. Gunzip and unpickle.
    """

    @staticmethod
    def generate_code(*codes, var_name='df'):
        """Build the source to execute remotely.

        Args:
            *codes: Source fragments, joined with newlines, that create the
                variable. They are embedded at column 0, so they must be
                valid top-level code.
            var_name: Name of the kernel variable to send back.

        Returns:
            A source string whose last expression is a PIL image holding the
            compressed pickle.
        """
        codes_str = '\n'.join(codes)
        return f"""
{codes_str}

{BASE_CODE.format(var_name)}

import numpy as np
from PIL import Image

side = int(np.ceil(np.sqrt(len(compressed))))
padded_data = np.pad(np.frombuffer(compressed, dtype=np.uint8),(0, side * side - len(compressed)),mode='constant')
img_array = padded_data.reshape(side, side)

img = Image.fromarray(img_array, 'L')
img
"""

    @staticmethod
    def extract_from_reply(reply):
        """Return the base64 ``image/png`` payload of an execution reply.

        The output is searched for, last match first, instead of assuming it
        sits at ``outputs[0]``: anything the user code emits beforehand would
        otherwise cause an unrelated ``KeyError`` or hide the traceback.

        Args:
            reply: Mapping with ``'status'`` and an ``'outputs'`` list of
                dicts (assumed nbformat-style -- TODO confirm).

        Raises:
            RuntimeError: If the remote execution reported an error.
            KeyError: If no output carries ``data['image/png']``.
        """
        outputs = reply['outputs']

        if reply['status'] == 'error':
            for output in outputs:
                if 'traceback' in output:
                    error_msg = '\n'.join(output['traceback'])
                    break
            else:
                error_msg = '(no traceback found in reply outputs)'
            raise RuntimeError(f"Jupyter execution error:\n{error_msg}")

        # The image is the cell's final expression, so prefer the last match.
        for output in reversed(outputs):
            data = output.get('data')
            if data and 'image/png' in data:
                return data['image/png']
        raise KeyError("no output with an 'image/png' payload in reply")

    @staticmethod
    def decode(b64_string):
        """Turn the base64 PNG payload back into the original object.

        Args:
            b64_string: Base64 text of the PNG produced by ``generate_code``.

        Returns:
            The unpickled object.
        """
        # Imported lazily so the text codec works without numpy/pillow.
        import numpy as np
        from PIL import Image

        img_array = np.array(Image.open(BytesIO(base64.b64decode(b64_string))))
        # The pixel bytes still end with the zero padding added by the encoder;
        # this relies on the gzip reader tolerating it -- TODO confirm.
        # NOTE(security): unpickling runs arbitrary code chosen by the sender.
        # Only use this with kernels/servers you trust.
        return pd.read_pickle(BytesIO(img_array.tobytes()), compression='gzip')

    @staticmethod
    def extract_decode(reply):
        """Extract the payload from ``reply`` and decode it in one step."""
        b64_string = JupyterImageCodec.extract_from_reply(reply)
        return JupyterImageCodec.decode(b64_string)
158
+
159
+
160
+ # ======================================================================================
161
+ from enum import Enum
162
+
163
+
164
class CodecType(Enum):
    """Selectable transport codecs; each member's value is the codec class."""

    TEXT = JupyterTextCodec  # base85 string returned as text/plain
    IMAGE = JupyterImageCodec  # bytes packed into a grayscale image
167
+
168
+
169
class LazyKernel:
    """Class-level holder for the active kernel client and codec selection.

    All state is stored on the class itself, so every caller in the process
    sees the same kernel and codec. Register the kernel with ``set_kernel``
    before calling any wrapper that uses ``get_kernel``.
    """

    # Kernel client registered by the application; None until set_kernel().
    _kernel = None
    # Which codec the wrappers should use; text unless changed.
    _codec_type = CodecType.TEXT

    @classmethod
    def set_kernel(cls, kernel_obj):
        """Register the kernel client that later calls will execute on."""
        cls._kernel = kernel_obj

    @classmethod
    def get_kernel(cls):
        """Return the registered kernel client.

        Raises:
            RuntimeError: If ``set_kernel`` has not been called yet.
        """
        if cls._kernel is None:
            raise RuntimeError("kernel 尚未初始化")
        return cls._kernel

    @classmethod
    def set_codec(cls, codec_type: CodecType):
        """Select the codec used by subsequent ``get_codec`` calls."""
        cls._codec_type = codec_type

    @classmethod
    def get_codec(cls):
        """Return the codec class for the current selection.

        The class is taken from the enum member's value, so members added to
        ``CodecType`` later are honoured automatically. Anything that is not
        a ``CodecType`` member keeps the historical fallback to the text
        codec.
        """
        selected = cls._codec_type
        if isinstance(selected, CodecType):
            return selected.value
        return JupyterTextCodec
@@ -0,0 +1,34 @@
1
+ # 本目录下只是演示如何封装API
2
+
3
+ import inspect
4
+ from functools import wraps
5
+
6
+ from jupyter_data_fetch.codec import LazyKernel
7
+
8
+
9
def auto_execute(func):
    """Decorator that runs the caller's source line in the remote kernel.

    The decorated function's own body is never executed. Instead, the text of
    the line that called it is read from the caller's frame, sent to the
    kernel as ``_ = <that line>``, and the decoded value of ``_`` is returned.

    Usage notes (from the original author):
        1. Use ``help()`` in the notebook to get the remote function's
           signature, declare a local stub with it and decorate the stub.
        2. The call must sit on a line of its own, because only the first
           line of the call site is captured.
        3. Meant as a quick, temporary shortcut; prefer explicit wrappers.

    Args:
        func: Local stub whose name and signature mirror the remote function.

    Returns:
        A wrapper with ``func``'s metadata.

    Raises:
        RuntimeError: From the wrapper, when the calling line's source text
            is not available.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        kernel = LazyKernel.get_kernel()
        codec = LazyKernel.get_codec()

        frame = inspect.currentframe().f_back
        try:
            context = inspect.getframeinfo(frame).code_context
        finally:
            # Drop the frame reference promptly to avoid a reference cycle.
            del frame

        # code_context is None when the source cannot be retrieved; fail with
        # a clear message rather than "NoneType is not subscriptable".
        if not context:
            raise RuntimeError(
                "auto_execute could not read the source of the calling line; "
                "call the function from a source file, on a line of its own"
            )
        call_line = context[0].strip()

        # The template below is sent to the kernel verbatim.
        code = f"""
# 在外部调用时,必须单独成一行
# 部分函数拼接后有缺失时,需退回到原始写法

_ = {call_line}
"""
        reply = kernel.execute(codec.generate_code(code, var_name='_'))
        return codec.extract_decode(reply)

    return wrapper
@@ -0,0 +1,62 @@
1
+ # Notebook中可以通过help()或??获得函数签名
2
+
3
+ from jupyter_data_fetch.codec import LazyKernel, TO_DICT_CODE, dict_to_object
4
+
5
+
6
def get_all_securities(types=None, date=None):
    """Call the kernel's ``get_all_securities`` and return the decoded result.

    Args:
        types: Value forwarded as the first positional argument. ``None``
            (the default) is sent as an empty list, which is what the former
            ``types=[]`` default produced, without sharing one mutable list
            object between calls.
        date: Value forwarded as the second positional argument.

    Returns:
        Whatever the remote call returned, decoded by the active codec.
    """
    if types is None:
        types = []
    kernel = LazyKernel.get_kernel()
    codec = LazyKernel.get_codec()
    # Arguments are embedded via repr(), so they must have an evaluable repr.
    code = f"""_ = get_all_securities({repr(types)}, {repr(date)})"""
    return codec.extract_decode(kernel.execute(codec.generate_code(code, var_name='_')))
12
+
13
+
14
def get_price(security, start_date=None, end_date=None, frequency='daily', fields=None, skip_paused=False, fq='pre', count=None, panel=True, fill_paused=True, round=True):
    """Call the kernel's ``get_price`` and return the decoded result.

    Every parameter is forwarded positionally, in signature order, as its
    ``repr()``; arguments therefore need a repr the kernel can evaluate.
    """
    kernel = LazyKernel.get_kernel()
    codec = LazyKernel.get_codec()
    forwarded = (
        security, start_date, end_date, frequency, fields, skip_paused,
        fq, count, panel, fill_paused, round,
    )
    arg_list = ', '.join(map(repr, forwarded))
    remote_call = f"_ = get_price({arg_list})"
    reply = kernel.execute(codec.generate_code(remote_call, var_name='_'))
    return codec.extract_decode(reply)
20
+
21
+
22
def get_security_info(code, date=None):
    """Call the kernel's ``get_security_info`` and return a namespace object.

    The remote result is flattened to a dict inside the kernel (using the
    ``object_to_dict`` helper from ``TO_DICT_CODE``) before transfer; the
    inline note records that unpickling the original object on the client
    fails with ``ModuleNotFoundError: No module named 'jqdata'``. The dict is
    then turned back into an attribute-style object locally.
    """
    kernel = LazyKernel.get_kernel()
    codec = LazyKernel.get_codec()
    # Sent to the kernel verbatim; must stay at column 0.
    remote_source = f"""
{TO_DICT_CODE}

_ = get_security_info({code!r}, {date!r}) # ModuleNotFoundError: No module named 'jqdata'
_ = object_to_dict(_)
"""
    reply = kernel.execute(codec.generate_code(remote_source, var_name='_'))
    as_dict = codec.extract_decode(reply)
    return dict_to_object(as_dict)
33
+
34
+
35
def get_fundamentals(query_object: str, date=None, statDate=None):
    """Call the kernel's ``get_fundamentals`` and return the decoded result.

    Note: the remote function takes a query object. Here ``query_object`` is
    a string containing the source of that expression, so it is pasted into
    the remote call as-is rather than through ``repr()``. It is therefore
    executed as code in the kernel. ``date`` and ``statDate`` are forwarded
    as their ``repr()``.
    """
    kernel = LazyKernel.get_kernel()
    codec = LazyKernel.get_codec()
    remote_call = "_ = get_fundamentals({}, {!r}, {!r})".format(query_object, date, statDate)
    reply = kernel.execute(codec.generate_code(remote_call, var_name='_'))
    return codec.extract_decode(reply)
42
+
43
+
44
def get_index_weights(index_id, date=None):
    """Call the kernel's ``get_index_weights`` and return the decoded result.

    Both arguments are forwarded positionally as their ``repr()``.
    """
    kernel = LazyKernel.get_kernel()
    codec = LazyKernel.get_codec()
    remote_call = f"_ = get_index_weights({index_id!r}, {date!r})"
    reply = kernel.execute(codec.generate_code(remote_call, var_name='_'))
    return codec.extract_decode(reply)
49
+
50
+
51
def get_extras(info, security_list, start_date=None, end_date='2015-12-31', df=True, count=None):
    """Call the kernel's ``get_extras`` and return the decoded result.

    Every parameter is forwarded positionally, in signature order, as its
    ``repr()``.
    """
    kernel = LazyKernel.get_kernel()
    codec = LazyKernel.get_codec()
    forwarded = (info, security_list, start_date, end_date, df, count)
    arg_list = ', '.join(map(repr, forwarded))
    remote_call = f"_ = get_extras({arg_list})"
    reply = kernel.execute(codec.generate_code(remote_call, var_name='_'))
    return codec.extract_decode(reply)
56
+
57
+
58
def get_industry(security, date=None):
    """Call the kernel's ``get_industry`` and return the decoded result.

    Both arguments are forwarded positionally as their ``repr()``.
    """
    kernel = LazyKernel.get_kernel()
    codec = LazyKernel.get_codec()
    remote_call = f"_ = get_industry({security!r}, {date!r})"
    reply = kernel.execute(codec.generate_code(remote_call, var_name='_'))
    return codec.extract_decode(reply)
@@ -0,0 +1,25 @@
1
+ [project]
2
+ name = "jupyter-data-fetch"
3
+ description = "fetch data from jupyter notebook"
4
+ readme = "README.md"
5
+ requires-python = ">=3.9"
6
+ dynamic = ["version"]
7
+ dependencies = [
8
+ 'jupyter_kernel_client',
9
+ 'pandas<3.0',
10
+ "pillow", # JupyterImageCodec才需要
11
+ ]
12
+
13
+ [build-system]
14
+ requires = ["hatchling"]
15
+ build-backend = "hatchling.build"
16
+
17
+ [tool.hatch.version]
18
+ path = "jupyter_data_fetch/_version.py"
19
+
20
+ [tool.hatch.build.targets.wheel]
21
+ packages = ["jupyter_data_fetch"]
22
+ include-package-data = true
23
+
24
+ [tool.hatch.build.targets.sdist]
25
+ include = ["jupyter_data_fetch*"]