dsslab-wdc-client 0.10.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,63 @@
1
+ Metadata-Version: 2.3
2
+ Name: dsslab-wdc-client
3
+ Version: 0.10.0
4
+ Summary: A small client for the WDC-Rest-API.
5
+ Author: Tino Schöllhorn
6
+ Author-email: t.schoellhorn@uni-mannheim.de
7
+ Requires-Python: >=3.10
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Provides-Extra: docs
14
+ Requires-Dist: Sphinx (==7.4.7) ; extra == "docs"
15
+ Requires-Dist: networkx (>=3.0)
16
+ Requires-Dist: pandas[excel, plot] (>=2.2,<3.0)
17
+ Requires-Dist: python-dotenv (>=1.1.0)
18
+ Requires-Dist: requests (>=2.3)
19
+ Requires-Dist: sphinx-rtd-theme (==3.0.2) ; extra == "docs"
20
+ Requires-Dist: sphinxcontrib-napoleon (==0.7) ; extra == "docs"
21
+ Project-URL: Documentation, https://dss-wdc.wiso.uni-hamburg.de/dss_wdc_client_docs/html/index.html
22
+ Project-URL: Homepage, https://dss-wdc.wiso.uni-hamburg.de/
23
+ Description-Content-Type: text/markdown
24
+
25
+ # Description
26
+ This project includes a *very* small client to access data from
27
+ the WebDataCollector-API. It is meant to provide a simple means
28
+ of accessing the data as JSON or as pandas DataFrames.
29
+
30
+ # Usage
31
+
32
+ ```
33
+ from dsslab.wdc_client import *
34
+
35
+ # Somehow initialize environment variables for
36
+ # "WDC_HOST" and "WDC_TOKEN", e.g. via python-dotenv (from dotenv import load_dotenv).
37
+ load_dotenv()
38
+
39
+ client = WDCClient.fromEnv()
40
+ df = client.loadAsDataFrame(
41
+ 'api/endpoint...', {'param1': 'value will be encoded'})
42
+ ```
43
+
44
+ For more information about the client and usable endpoints,
45
+ see the project homepage of WDC or directly consult the Documentation.
46
+
47
+ # Changelog
48
+ - 0.10.0 Moved to new package structure with the namespace dsslab.
49
+
50
+ # Old changelog
51
+ - 0.9.0 Add method WDCClient#put to create PUT-Requests
52
+ - 0.8.2 Fix for duplicate parameters when paging.
53
+ - 0.8.1 Simplify new methods for loading a DomainGraph.
54
+ - 0.8.0 Added new methods for loading DomainGraphs.
55
+ - 0.7.3 Fix README
56
+ - 0.7.2 Include link for generated documentation
57
+ - 0.7.1 Added generated documentation
58
+ - 0.7.0 Added new method WDCClient.loadDomainGraph for loading a DomainGraph as NetworkX-Object
59
+ - 0.6.0 WDCClient throws a WDCException if a request to the server fails
60
+ - 0.5.0 New signatures and methods taking care of encodings and working on large results
61
+ - 0.4.3 Add dependencies pandas["excel, plot"] as they are likely to be used.
62
+ - 0.4.2 Enhance README with Changelog and code-example.
63
+ - 0.4.1 Include a preferred variant for creating WDCClients from the Environment
@@ -0,0 +1,39 @@
1
+ # Description
2
+ This project includes a *very* small client to access data from
3
+ the WebDataCollector-API. It is meant to provide a simple means
4
+ of accessing the data as JSON or as pandas DataFrames.
5
+
6
+ # Usage
7
+
8
+ ```
9
+ from dsslab.wdc_client import *
10
+
11
+ # Somehow initialize environment variables for
12
+ # "WDC_HOST" and "WDC_TOKEN", e.g. via python-dotenv (from dotenv import load_dotenv).
13
+ load_dotenv()
14
+
15
+ client = WDCClient.fromEnv()
16
+ df = client.loadAsDataFrame(
17
+ 'api/endpoint...', {'param1': 'value will be encoded'})
18
+ ```
19
+
20
+ For more information about the client and usable endpoints,
21
+ see the project homepage of WDC or directly consult the Documentation.
22
+
23
+ # Changelog
24
+ - 0.10.0 Moved to new package structure with the namespace dsslab.
25
+
26
+ # Old changelog
27
+ - 0.9.0 Add method WDCClient#put to create PUT-Requests
28
+ - 0.8.2 Fix for duplicate parameters when paging.
29
+ - 0.8.1 Simplify new methods for loading a DomainGraph.
30
+ - 0.8.0 Added new methods for loading DomainGraphs.
31
+ - 0.7.3 Fix README
32
+ - 0.7.2 Include link for generated documentation
33
+ - 0.7.1 Added generated documentation
34
+ - 0.7.0 Added new method WDCClient.loadDomainGraph for loading a DomainGraph as NetworkX-Object
35
+ - 0.6.0 WDCClient throws a WDCException if a request to the server fails
36
+ - 0.5.0 New signatures and methods taking care of encodings and working on large results
37
+ - 0.4.3 Add dependencies pandas["excel, plot"] as they are likely to be used.
38
+ - 0.4.2 Enhance README with Changelog and code-example.
39
+ - 0.4.1 Include a preferred variant for creating WDCClients from the Environment
@@ -0,0 +1,40 @@
1
+ [tool.poetry]
2
+ name = "dsslab-wdc-client"
3
+ packages = [
4
+ { include = "dsslab", from = "src" }
5
+ ]
6
+ version = "0.10.0"
7
+ description = "A small client for the WDC-Rest-API."
8
+ authors = ["Tino Schöllhorn <t.schoellhorn@uni-mannheim.de>"]
9
+ readme = "README.md"
10
+ homepage = "https://dss-wdc.wiso.uni-hamburg.de/"
11
+ documentation = "https://dss-wdc.wiso.uni-hamburg.de/dss_wdc_client_docs/html/index.html"
12
+
13
+ [tool.poetry.dependencies]
14
+ python = ">=3.10"
15
+ requests = ">=2.3"
16
+ python-dotenv = ">=1.1.0"
17
+ pandas = {extras = ["excel, plot"], version = "^2.2"}
18
+ networkx = ">=3.0"
19
+
20
+ # Docs
21
+ Sphinx = { version = "7.4.7", optional = true }
22
+ sphinx-rtd-theme = { version = "3.0.2", optional = true }
23
+ sphinxcontrib-napoleon = { version = "0.7", optional = true }
24
+
25
+ [tool.poetry.extras]
26
+ docs = ["Sphinx", "sphinx-rtd-theme", "sphinxcontrib-napoleon"]
27
+
28
+
29
+ [tool.poetry.group.dev.dependencies]
30
+ pytest = "^8.3.3"
31
+
32
+ [tool.pytest.ini_options]
33
+ log_cli = true
34
+ log_cli_level = "DEBUG"
35
+ log_cli_format = "%(asctime)s [%(levelname)8s-%(name)s]: %(message)s (%(filename)s:%(lineno)s)"
36
+ log_cli_date_format = "%Y-%m-%d %H:%M:%S"
37
+
38
+ [build-system]
39
+ requires = ["poetry-core"]
40
+ build-backend = "poetry.core.masonry.api"
@@ -0,0 +1 @@
1
+ from .client import *
@@ -0,0 +1,266 @@
1
+ """
2
+ client.py
3
+ ====================================
4
+ The core module provides access to the WDC-API.
5
+ It provides a consistent interface for handling
6
+ the data as JSON, DataFrame or Graph and transparently handles
7
+ paging for large result-sets.
8
+ """
9
+
10
+ import requests
11
+ import logging
12
+ import os
13
+ import json
14
+ from typing import Any
15
+ from collections.abc import Callable
16
+ import networkx as nx
17
+
18
+ import pandas as pd
19
+
20
class WDCException(Exception):
    """
    An exception which is raised when an error within the WDCClient
    occurs.

    :param message: the description of the error.
    :param query: optional query that belongs to the failed request;
        stored as attribute ``query`` (None if not given).
    :param state: optional state reported for the failed request;
        stored as attribute ``state`` (None if not given).
    """

    def __init__(self, message, query = None, state = None):
        super().__init__(message)
        self.query = query
        self.state = state

    def __str__(self):
        """
        Returns the message followed by the query and the state.
        """
        # query and state default to None and are not guaranteed to be
        # strings, so they are converted explicitly. Plain concatenation
        # would raise a TypeError while the exception is being printed.
        return (
            super().__str__() +
            ", query: " + str(self.query) +
            ", state: " + str(self.state))
36
+
37
class WDCClient:
    """
    Client for the WDC-API.

    The client keeps one ``requests.Session`` for all requests. If a token
    is given, it is sent with every request in the header ``token``.
    """

    @staticmethod
    def fromEnv():
        """
        Creates a WDCClient from the 'Environment'.

        Uses the environment variables 'WDC_HOST' and 'WDC_TOKEN' from the current
        environment. Thus, you can make use of modules such as python-dotenv
        or other variants more easily.

        Remember: Using passwords or tokens in source code is dangerous!

        :return: A new WDCClient configured from the environment values.
        :raises WDCException: if 'WDC_HOST' is not set.
        """
        return WDCClient(
            host = os.getenv('WDC_HOST'),
            token = os.getenv('WDC_TOKEN'))

    def __init__(self, host: str, token = None):
        """
        Creates a client for the given host.

        :param host: the base URL of the server; must not be None.
        :param token: an optional token which is sent as header 'token'.

        :raises WDCException: if host is None.
        """
        self.logger = logging.getLogger(__name__)
        self.host = host
        self.token = token

        if self.host is None:
            raise WDCException("Could not create WDCClient with host = None")

        self.session = requests.Session()
        if self.token is not None:
            self.session.headers.update({'token': self.token})

    def _url(self, endpoint: str) -> str:
        """
        Builds the URL for an endpoint.

        Host and endpoint are joined with exactly one slash, so endpoints
        may be given with or without a leading slash
        ('api/x' as well as '/api/x').
        """
        return self.host.rstrip("/") + "/" + endpoint.lstrip("/")

    def loadAsDataFrame(self, endpoint: str, params: dict[str, Any] | None = None) -> pd.DataFrame:
        """
        Loads the *complete* tabular data from the endpoint and returns a
        Pandas-DataFrame. The method transparently pages through the
        complete results.

        :param endpoint: the endpoint
        :param params: a dictionary with possible parameters for the
            query-string of the request. Values will be properly encoded.

        :return: the data as Pandas-DataFrame
        """
        rows = self.loadAsJson(endpoint, params)

        return pd.json_normalize(rows)

    def loadAsJson(self, endpoint: str, params: dict[str, Any] | None = None) -> list:
        """
        Loads the tabular data from the endpoint and returns it as
        JSON-Array. The method transparently pages through the
        complete results.

        :param endpoint: the endpoint
        :param params: a dictionary with possible parameters for the
            query-string of the request. Values will be properly encoded.

        :return: the data as JSON-Array
        """
        res = []

        def collect_it(e, pos, maxPos):
            res.append(e)

        self.loadForEach(endpoint, params, collect_it)

        return res

    def loadForEach(self, endpoint: str, params: dict[str, Any] | None = None, f: Callable[[Any, int, int], None] = None) -> None:
        """
        Provides the means to work on larger resultsets by providing a Callback.

        :param endpoint: the endpoint
        :param params: a dictionary with possible parameters for the
            query-string of the request. Values will be properly encoded.
        :param f: a Callable-Object (function, ...) with the signature (row, currentPos, maxPos) as a callback to work on
            each entry in the dataset. Although it defaults to None, a
            callable is required as soon as the result contains an entry.

        :raises WDCException: if the state in the 'responseHeader' of a
            response is not 'OK'.
        """
        url = self._url(endpoint)
        if params is None:
            params = {}

        self.logger.debug("endpoint:%s, params:%s", url, params)

        # Only the first request may carry the params. All following
        # requests use the next-link, which already contains them.
        # This is tracked separately from the element counter, so an
        # empty first page cannot cause the params to be sent twice.
        request_params = params
        counter = 1
        while url is not None:
            response = self.session.get(url, params = request_params)
            request_params = None

            self.logger.debug("headers: %s", response.headers)

            payload = response.json()

            # Everything ok?
            header = payload['responseHeader']
            if header['state'] != 'OK':
                raise WDCException(
                    header['msg'],
                    query = header['query'],
                    state = header['state'])

            self.logger.debug("json: %s", payload)

            for e in payload["content"]:
                f(e, counter, payload['page']['totalElements'])
                counter += 1

            # Is there another page?
            if 'links' in payload and 'next' in payload['links']:
                url = payload['links']['next']
                self.logger.debug("nextLink %s", url)
            else:
                url = None

    def put(self, endpoint: str, body: str, params: dict[str, Any] | None = None) -> None:
        """
        Executes a PUT request to the specified endpoint with a body.
        Raises a WDCException if the status code of the response is
        neither 200 nor 201.

        :param endpoint: the endpoint
        :param body: the body to send with the PUT-Request
        :param params: a dictionary which is passed as *keyword arguments*
            to the PUT call of the session (for example
            ``{'params': {...}}`` or ``{'headers': {...}}``).
            NOTE(review): the other methods use params as the
            query-string dictionary - confirm that this difference is
            intended.

        :raises WDCException: if the status code is neither 200 nor 201.
        """
        url = self._url(endpoint)

        response = self.session.put(url, data=body, **(params or {}))

        if response.status_code not in (200, 201):
            raise WDCException("Could not send PUT for url: " + url)

    def __str__(self) -> str:
        # The token is a secret and must not end up in logs or console
        # output, so only its presence is shown.
        token = None if self.token is None else "***"
        return "[host=" + str(self.host) + ", token=" + str(token) + "]"

    def loadDomainGraph(self, snapshot: str, selection: str | None = None, variant: str = 'ONLY_SEEDS') -> nx.DiGraph:
        """
        Loads a DomainGraph as a DiGraph.

        Note: If you intend to "merge" other data to nodes or edges, it
        might be simpler to use the method loadDomainGraphData() to load
        the data, modify it and create the graph with createDomainGraph().

        :param snapshot: The machineName of the snapshot.
        :param selection: A selection of the snapshot.
        :param variant: A value of an enumeration of the variant of the DomainGraph.

        :return: DiGraph of the DomainGraph
        """
        # loadDomainGraphData resolves the id of the DomainGraph itself.
        nodes, edges = self.loadDomainGraphData(snapshot, selection, variant)

        return self.createDomainGraph(nodes, edges)

    def findDomainGraphId(self, snapshot: str, selection: str | None = None, variant: str = 'ONLY_SEEDS'):
        """
        Finds the DomainGraphId of the specified Graph.

        :param snapshot: The machineName of the snapshot.
        :param selection: A selection of the snapshot.
        :param variant: A value of an enumeration of the variant of the DomainGraph.

        :return: the ID of the specified DomainGraph.
        :raises WDCException: if not exactly one DomainGraph is found.
        """
        domainGraphs = self.loadAsJson(
            "/api/domaingraph/list",
            {
                "snapshot": snapshot,
                "selection": selection,
                "variant": variant
            })

        self.logger.debug("domainGraphs:%s", domainGraphs)

        # There must be exactly one match.
        if len(domainGraphs) != 1:
            raise WDCException("There must be exactly one DomainGraph but found: " + str(len(domainGraphs)))

        return domainGraphs[0]['id']

    def loadDomainGraphData(self, snapshot: str, selection: str | None = None, variant: str = 'ONLY_SEEDS'):
        """
        Convenience method to load the nodes *and* edges of a DomainGraph.

        :param snapshot: The machineName of the snapshot.
        :param selection: A selection of the snapshot.
        :param variant: A value of an enumeration of the variant of the DomainGraph.

        :return: a tuple of (nodes, edges) as JSON.
        """
        domainGraphId = self.findDomainGraphId(snapshot, selection, variant)
        nodes = self.loadAsJson(f"/api/domaingraph/{domainGraphId}/nodes")
        edges = self.loadAsJson(f"/api/domaingraph/{domainGraphId}/edges")

        return nodes, edges

    def createDomainGraph(self, nodes, edges) -> nx.DiGraph:
        """
        Create a DiGraph from a list of nodes and a list of edges.

        Each node needs the key 'id'; all other keys become attributes
        of the node. Each edge needs the keys 'source', 'target' and
        'weight'. The passed objects are not modified.

        :param nodes: a JSON-Object or a DataFrame
        :param edges: a JSON-Object or a DataFrame

        :return: the created DiGraph.
        """
        if isinstance(nodes, pd.DataFrame):
            nodes = json.loads(nodes.to_json(orient="records"))

        if isinstance(edges, pd.DataFrame):
            edges = json.loads(edges.to_json(orient="records"))

        # Build the graph
        graph = nx.DiGraph()
        for n in nodes:
            # Work on a copy, so that the 'id' is not removed from the
            # caller's data and the same nodes can be used again.
            attrs = dict(n)
            _id = attrs.pop("id")
            graph.add_node(_id, **attrs)

        for e in edges:
            graph.add_edge(e['source'], e['target'], weight=e['weight'])

        return graph