mercuto-client 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of mercuto-client might be problematic.
- mercuto_client/__init__.py +30 -0
- mercuto_client/_tests/__init__.py +0 -0
- mercuto_client/_tests/conftest.py +0 -0
- mercuto_client/_tests/test_ingester/__init__.py +0 -0
- mercuto_client/_tests/test_ingester/test_file_processor.py +210 -0
- mercuto_client/_tests/test_ingester/test_ftp.py +37 -0
- mercuto_client/_tests/test_ingester/test_parsers.py +145 -0
- mercuto_client/_tests/test_mocking.py +93 -0
- mercuto_client/_util.py +13 -0
- mercuto_client/acl.py +101 -0
- mercuto_client/client.py +903 -0
- mercuto_client/exceptions.py +15 -0
- mercuto_client/ingester/__init__.py +0 -0
- mercuto_client/ingester/__main__.py +287 -0
- mercuto_client/ingester/ftp.py +115 -0
- mercuto_client/ingester/parsers/__init__.py +42 -0
- mercuto_client/ingester/parsers/campbell.py +12 -0
- mercuto_client/ingester/parsers/generic_csv.py +114 -0
- mercuto_client/ingester/parsers/worldsensing.py +23 -0
- mercuto_client/ingester/processor.py +291 -0
- mercuto_client/ingester/util.py +64 -0
- mercuto_client/mocks.py +203 -0
- mercuto_client/py.typed +0 -0
- mercuto_client/types.py +409 -0
- mercuto_client-0.1.0.dist-info/METADATA +16 -0
- mercuto_client-0.1.0.dist-info/RECORD +29 -0
- mercuto_client-0.1.0.dist-info/WHEEL +5 -0
- mercuto_client-0.1.0.dist-info/licenses/LICENSE +619 -0
- mercuto_client-0.1.0.dist-info/top_level.txt +1 -0
mercuto_client/exceptions.py
@@ -0,0 +1,15 @@
import json


class MercutoClientException(Exception):
    pass


class MercutoHTTPException(MercutoClientException):
    def __init__(self, message: str, status_code: int) -> None:
        super().__init__(message)
        self.status_code = status_code
        self.message = message

    def json(self) -> dict:
        return json.loads(self.message)
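A minimal sketch of how a caller might handle an HTTP failure with these exceptions, assuming the top-level package re-exports them (as the ingester's own imports suggest) and that the server reports errors as JSON bodies; the project code and API key are placeholders:

```python
from mercuto_client import MercutoClient, MercutoHTTPException

client = MercutoClient(url="https://api.rockfieldcloud.com.au")
try:
    with client.as_credentials(api_key="my-api-key") as session:
        session.projects().get_project("PROJ01")
except MercutoHTTPException as e:
    # status_code distinguishes retryable from non-retryable failures;
    # json() decodes the message when the server returned a JSON body.
    print(e.status_code, e.json())
```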
mercuto_client/ingester/__main__.py
@@ -0,0 +1,287 @@
import argparse
import fnmatch
import itertools
import logging
import logging.handlers
import os
import sys
import time
from typing import Callable, TypeVar

import schedule

from .. import MercutoClient, MercutoHTTPException
from ..types import DataSample
from .ftp import simple_ftp_server
from .parsers import detect_parser
from .processor import FileProcessor
from .util import batched, get_free_space_excluding_files, get_my_public_ip

logger = logging.getLogger(__name__)

NON_RETRYABLE_ERRORS = {400, 404, 409}  # HTTP status codes that indicate non-retryable errors


class MercutoIngester:
    def __init__(self, project_code: str, api_key: str, hostname: str = 'https://api.rockfieldcloud.com.au') -> None:
        self._client = MercutoClient(url=hostname)
        self._api_key = api_key
        with self._client.as_credentials(api_key=api_key) as client:
            self._project = client.projects().get_project(project_code)
            assert self._project['code'] == project_code

            self._secondary_channels = client.channels().get_channels(project_code, classification='SECONDARY')
            self._datatables = list(itertools.chain.from_iterable([dt['datatables'] for dt in client.devices().list_dataloggers(project_code)]))

        self._channel_map = {c['label']: c['code'] for c in self._secondary_channels}

    def update_mapping(self, mapping: dict[str, str]) -> None:
        """
        Update the channel label to channel code mapping.
        """
        self._channel_map.update(mapping)
        logger.info(f"Updated channel mapping: {self._channel_map}")

    @property
    def project_code(self) -> str:
        return self._project['code']

    def ping(self) -> None:
        """
        Ping the Mercuto server to update the last seen IP address.
        """
        ip = get_my_public_ip()
        with self._client.as_credentials(api_key=self._api_key) as client:
            client.projects().ping_project(self.project_code, ip_address=ip)
        logging.info(f"Pinged Mercuto server from IP: {ip} for project: {self.project_code}")

    def matching_datatable(self, filename: str) -> str | None:
        """
        Check if any datatables on the project match this file name.
        Returns the datatable code if a match is found, otherwise None.
        """
        basename = os.path.basename(filename)

        def matches(test: str) -> bool:
            """
            test should be a pattern or a filename,
            e.g. "my_data.csv", "my_data*.csv" or "/path/to/my_data*.csv".
            Do wildcard matching as well as prefix matching.
            """
            test_base = os.path.basename(test)
            if fnmatch.fnmatch(basename, test_base):
                return True
            lhs, _ = os.path.splitext(test_base)
            if basename.startswith(lhs):
                return True
            return False

        for dt in self._datatables:
            # Match using datatable pattern
            if matches(dt['name']):
                return dt['code']
            if dt['src'] and matches(dt['src']):
                return dt['code']
        return None

    def _upload_samples(self, samples: list[DataSample]) -> bool:
        """
        Upload samples to the Mercuto project.
        """
        try:
            with self._client.as_credentials(api_key=self._api_key) as client:
                for batch in batched(samples, 500):
                    client.data().upload_samples(batch)
            return True
        except MercutoHTTPException as e:
            if e.status_code in NON_RETRYABLE_ERRORS:
                logger.exception(
                    "Error indicates bad file that should not be retried. Skipping.")
                return True
            else:
                return False

    def _upload_file(self, file_path: str, datatable_code: str) -> bool:
        """
        Upload a file to the Mercuto project.
        """
        logging.info(f"Uploading file {file_path} to datatable {datatable_code} in project {self.project_code}")
        try:
            with self._client.as_credentials(api_key=self._api_key) as client:
                client.data().upload_file(
                    project=self.project_code,
                    datatable=datatable_code,
                    file=file_path,
                )
            return True
        except MercutoHTTPException as e:
            if e.status_code in NON_RETRYABLE_ERRORS:
                logger.exception(
                    "Error indicates bad file that should not be retried. Skipping.")
                return True
            else:
                return False

    def process_file(self, file_path: str) -> bool:
        """
        Process the received file.
        """
        logging.info(f"Processing file: {file_path}")
        datatable_code = self.matching_datatable(file_path)
        if datatable_code:
            logger.info(f"Matched datatable code: {datatable_code} for file: {file_path}")
            return self._upload_file(file_path, datatable_code)
        else:
            parser = detect_parser(file_path)
            samples = parser(file_path, self._channel_map)
            if not samples:
                logging.warning(f"No samples found in file: {file_path}")
                return True
            return self._upload_samples(samples)


T = TypeVar('T')


def call_and_log_error(func: Callable[[], T]) -> T | None:
    """
    Call a function and log any exceptions that occur.
    """
    try:
        return func()
    except Exception:
        logging.exception(f"Error in {func.__name__}")
        return None


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Mercuto Ingester CLI')
    parser.add_argument('-p', '--project', type=str,
                        required=True, help='Mercuto project code')
    parser.add_argument('-k', '--api-key', type=str,
                        required=True, help='API key for Mercuto')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Enable verbose output')
    parser.add_argument('-d', '--directory', type=str,
                        help='Directory to store ingested files. Default is a directory called `buffered-files` in the workdir.')
    parser.add_argument('-s', '--size', type=int,
                        help='Size in MB for the total amount of files to store in the buffer. \
                        Default is 75%% of the available disk space on the buffer partition, excluding the directory itself.', default=None)
    parser.add_argument('--max-attempts', type=int,
                        help='Maximum number of attempts to process a file before giving up. Default is 1000.',
                        default=1000)
    parser.add_argument('--workdir', type=str,
                        help='Working directory for the ingester. Default is ~/.mercuto-ingester',)
    parser.add_argument('--logfile', type=str,
                        help='Log file path. A maximum of 4 log files of 1MB each will be kept. \
                        Default is log.txt in the workdir.')
    parser.add_argument('--mapping', type=str,
                        help='Path to a JSON file with channel label to channel code mapping. \
                        If not provided, the ingester will try to detect the channels from the project.',
                        default=None)
    parser.add_argument('--hostname', type=str,
                        help='Hostname to use for the Mercuto server. Default is "https://api.rockfieldcloud.com.au".',
                        default='https://api.rockfieldcloud.com.au')
    parser.add_argument('--clean',
                        help='Drop the database before starting. This will not remove any buffer files and will rescan them on startup.',
                        action='store_true')
    parser.add_argument('--username', type=str,
                        help='Username for the FTP server. Default is "logger".',
                        default='logger')
    parser.add_argument('--password', type=str,
                        help='Password for the FTP server. Default is "password".',
                        default='password')
    parser.add_argument('--port', type=int,
                        help='Port for the FTP server. Default is 2121.',
                        default=2121)
    parser.add_argument('--no-rename', action='store_true',
                        help='Do not append the current timestamp to files received via FTP. \
                        By default a timestamp is appended to avoid overwriting files with the same name.')

    args = parser.parse_args()

    if args.workdir is None:
        workdir = os.path.join(os.path.expanduser('~'), ".mercuto-ingester")
    else:
        workdir = args.workdir
        if not os.path.exists(args.workdir):
            raise ValueError(f"Work directory {args.workdir} does not exist")
    os.makedirs(workdir, exist_ok=True)

    if args.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO

    handlers: list[logging.Handler] = []
    handlers.append(logging.StreamHandler(sys.stderr))

    if args.logfile is not None:
        logfile = args.logfile
    else:
        logfile = os.path.join(workdir, 'log.txt')
    handlers.append(logging.handlers.RotatingFileHandler(
        logfile, maxBytes=1000000, backupCount=3))

    logging.basicConfig(format='[PID %(process)d] %(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                        datefmt='%d/%m/%Y %H:%M:%S',
                        level=level,
                        handlers=handlers)

    if args.directory is None:
        buffer_directory = os.path.join(workdir, "buffered-files")
    else:
        buffer_directory = args.directory
    os.makedirs(buffer_directory, exist_ok=True)

    ftp_dir = os.path.join(workdir, 'temp-ftp-data')
    os.makedirs(ftp_dir, exist_ok=True)

    size = args.size
    if size is None:
        size = get_free_space_excluding_files(buffer_directory) * 0.75 // (1024 * 1024)  # Convert to MB
        logging.info(f"Buffer size set to {size} MB based on available disk space.")

    if args.mapping is not None:
        import json
        with open(args.mapping, 'r') as f:
            mapping = json.load(f)
        if not isinstance(mapping, dict):
            raise ValueError(f"Mapping file {args.mapping} must contain a JSON object")
    else:
        mapping = {}

    logger.info(f"Using work directory: {workdir}")

    database_path = os.path.join(workdir, "buffer.db")
    if args.clean and os.path.exists(database_path):
        logging.info(f"Dropping existing database at {database_path}")
        os.remove(database_path)

    ingester = MercutoIngester(
        project_code=args.project,
        api_key=args.api_key,
        hostname=args.hostname)

    ingester.update_mapping(mapping)

    processor = FileProcessor(
        buffer_dir=buffer_directory,
        db_path=database_path,
        process_callback=ingester.process_file,
        max_attempts=args.max_attempts,
        free_space_mb=size)

    processor.scan_existing_files()

    with simple_ftp_server(directory=buffer_directory,
                           username=args.username, password=args.password, port=args.port,
                           callback=processor.add_file_to_db, rename=not args.no_rename,
                           workdir=workdir):
        schedule.every(60).seconds.do(call_and_log_error, ingester.ping)
        schedule.every(5).seconds.do(call_and_log_error, processor.process_next_file)
        schedule.every(2).minutes.do(call_and_log_error, processor.cleanup_old_files)

        while True:
            schedule.run_pending()
            time.sleep(0.5)
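This module is normally run as a script, e.g. `python -m mercuto_client.ingester -p <project> -k <api-key>`. The same machinery can also be driven programmatically; a minimal sketch, noting that the constructor makes live API calls to resolve the project, channels and datatables, and that the project code, API key, mapping and file path below are placeholders:

```python
from mercuto_client.ingester.__main__ import MercutoIngester

ingester = MercutoIngester(project_code="PROJ01", api_key="my-api-key")

# Map CSV column labels to channel codes that were not auto-detected from the project.
ingester.update_mapping({"Temp_C": "CH-TEMP"})

# True means the file was uploaded (or is a bad file that should not be retried);
# False means the upload failed and the FileProcessor should retry it later.
ok = ingester.process_file("/data/buffered-files/logger_20240102T030405.dat")
```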
mercuto_client/ingester/ftp.py
@@ -0,0 +1,115 @@
import contextlib
import logging
import os
import shutil
import tempfile
import threading
from datetime import datetime, timezone
from typing import Callable, Iterator, Optional

from pyftpdlib import authorizers  # type: ignore[import-untyped]
from pyftpdlib.handlers import FTPHandler  # type: ignore[import-untyped]
from pyftpdlib.servers import FTPServer  # type: ignore[import-untyped]

logger = logging.getLogger(__name__)


@contextlib.contextmanager
def simple_ftp_server(directory: str,
                      username: str,
                      password: str,
                      port: int = 2121,
                      callback: Optional[Callable[[str], None]] = None,
                      workdir: Optional[str] = None,
                      rename: bool = True,
                      clock: Optional[Callable[[], datetime]] = None) -> Iterator[None]:
    """
    Wrapper for a simple FTP server that allows uploading files to a specified directory.
    A callback function can be provided which is called with the destination path of each uploaded file.
    Files are first uploaded to a work directory and then moved to the specified directory.
    If workdir is not specified, a temporary directory is used.

    :param directory: Directory where files will be uploaded.
    :param username: Username for FTP authentication.
    :param password: Password for FTP authentication.
    :param port: Port on which the FTP server will listen.
    :param callback: Optional callback function that is called with the destination path of each uploaded file.
    :param workdir: Optional working directory where files are initially uploaded before moving to the final directory.
    :param rename: If True, appends a timestamp to the filename to avoid overwriting existing files.
    :param clock: Function to get the current time, defaults to datetime.now with timezone UTC.
    :return: Context manager that starts the FTP server and allows file uploads.

    Runs in a background thread during context manager usage.

    Example usage:

    ```python
    def my_callback(dest_path: str):
        print(f"File uploaded to: {dest_path}")

    with simple_ftp_server('/path/to/upload/dir', 'user', 'pass', port=2121, callback=my_callback) as server:
        # Your code here, e.g. processing files
        while True:
            time.sleep(10)
    ```
    """

    if clock is None:
        def clock(): return datetime.now(timezone.utc)

    def rename_file(file_path: str) -> str:
        """
        Rename the file by appending a timestamp to avoid overwriting.
        Adds the timestamp before the file extension.
        """
        base, ext = os.path.splitext(file_path)
        timestamp = clock().strftime("%Y%m%dT%H%M%S")
        new_name = f"{base}_{timestamp}{ext}"
        return new_name

    class CustomFTPHandler(FTPHandler):
        def on_file_received(self, file):
            target = os.path.join(directory, os.path.basename(file))

            if rename:
                target = rename_file(target)

            dest = shutil.move(file, target)
            if callback:
                callback(dest)

        def on_incomplete_file_received(self, file):
            try:
                os.remove(file)
            except Exception:
                logger.exception(f"Failed to remove incomplete file: {file}")

    if workdir is None:
        workdir_ctx: contextlib.AbstractContextManager[str] = tempfile.TemporaryDirectory(prefix="ftp_workdir_")
    else:
        workdir_ctx = contextlib.nullcontext(workdir)

    with workdir_ctx as workdir:
        authorizer = authorizers.DummyAuthorizer()
        authorizer.add_user(username, password,
                            workdir, perm='lwe')
        handler = CustomFTPHandler
        handler.authorizer = authorizer
        handler.banner = "FTP Server Ready."
        handler.passive_ports = range(60000, 65535)

        address = ('0.0.0.0', port)
        server = FTPServer(address, handler)
        server.max_cons = 60
        server.max_cons_per_ip = 20

        server_thread = threading.Thread(target=server.serve_forever, daemon=True)
        logger.debug(f"Starting FTP server on {port}...")
        server_thread.start()
        try:
            yield
        finally:
            logger.debug("Stopping FTP server...")
            server.close_all()
            server_thread.join(timeout=10)
            logger.debug("FTP server stopped.")
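The `clock` parameter presumably exists so that the timestamp appended by `rename_file` can be pinned, for example in tests. A small sketch of the resulting name, assuming a fixed UTC clock; the filename itself is illustrative:

```python
from datetime import datetime, timezone

def fixed_clock() -> datetime:
    return datetime(2024, 1, 2, 3, 4, 5, tzinfo=timezone.utc)

# With rename=True, an uploaded "data.csv" is moved into the target directory as
# "data_<timestamp>.csv", using clock().strftime("%Y%m%dT%H%M%S") for the timestamp.
print(f"data_{fixed_clock().strftime('%Y%m%dT%H%M%S')}.csv")  # data_20240102T030405.csv
```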
mercuto_client/ingester/parsers/__init__.py
@@ -0,0 +1,42 @@
from typing import Optional, Protocol

import pytz

from ...types import DataSample
from .campbell import parse_campbell_file
from .worldsensing import (parse_worldsensing_compact_file,
                           parse_worldsensing_standard_file)


class Parser(Protocol):
    def __call__(self, filename: str, label_to_channel_code: dict[str, str],
                 timezone: Optional[pytz.BaseTzInfo] = None) -> list[DataSample]:
        """
        Parse the file and return a list of DataSample objects.
        """
        pass


def detect_parser(filename: str) -> Parser:
    """
    Detect the type of the file based on its content.
    Returns the matching parser function (Campbell, Worldsensing compact or Worldsensing standard),
    or raises ValueError if the file type is unknown.
    """
    with open(filename, 'r') as f:
        first_line = f.readline().strip()
        if first_line.startswith('"TOA5",'):
            return parse_campbell_file
        elif first_line.startswith('"Datalogger","compacted"'):
            return parse_worldsensing_compact_file
        elif first_line.startswith('"Node ID",'):
            return parse_worldsensing_standard_file
        else:
            raise ValueError(f"Unknown file type for {filename}")


__all__ = [
    "parse_campbell_file",
    "parse_worldsensing_standard_file",
    "parse_worldsensing_compact_file",
    "detect_parser",
]
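Detection keys off the first line of the file. A minimal sketch for the Campbell case, where the TOA5 environment line is illustrative and only needs to satisfy the `startswith('"TOA5",')` check:

```python
import tempfile

from mercuto_client.ingester.parsers import detect_parser, parse_campbell_file

with tempfile.NamedTemporaryFile('w', suffix='.dat', delete=False) as f:
    # Illustrative Campbell Scientific TOA5 environment line.
    f.write('"TOA5","CR1000","CR1000","1234","CR1000.Std.32","CPU:prog.CR1","5678","Table1"\n')
    path = f.name

assert detect_parser(path) is parse_campbell_file
```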
mercuto_client/ingester/parsers/campbell.py
@@ -0,0 +1,12 @@
from typing import Optional

import pytz

from ...types import DataSample
from .generic_csv import parse_generic_csv_file


def parse_campbell_file(filename: str, label_to_channel_code: dict[str, str],
                        timezone: Optional[pytz.BaseTzInfo] = None) -> list[DataSample]:
    return parse_generic_csv_file(
        filename, label_to_channel_code, header_index=1, data_index=2, timezone=timezone)
mercuto_client/ingester/parsers/generic_csv.py
@@ -0,0 +1,114 @@
import logging
from datetime import datetime
from typing import Optional

import pytz
from dateutil import parser

from ...types import DataSample

logger = logging.getLogger(__name__)


def _clean(s: str) -> str:
    s = s.strip()
    if s.startswith('"') and s.endswith('"'):
        s = s[1:-1]
    return s


def _clean_number(s: str) -> float | None:
    cleaned = _clean(s)
    if cleaned == 'NAN' or cleaned == 'N/A' or cleaned == 'NaN' or cleaned == 'nan' or cleaned == 'Nan':
        return float('nan')
    try:
        return float(cleaned)
    except ValueError:
        return None


def _parse_header(header: str) -> list[str]:
    columns = [h.strip() for h in header.strip().split(",")]
    if columns[0] not in ('"TIMESTAMP"', 'timestamp', 'TIMESTAMP', '"Date-and-time"'):
        raise ValueError(
            f"Invalid header found: {columns[0]}, expecting TIMESTAMP.")
    # Columns have quotes around them, remove them
    return [_clean(c) for c in columns[1:]]


def _parse_csv_line(line: str, sep: str = ',', timestamp_index: int = 0) -> tuple[datetime, list[float | None]]:
    """
    Returns timestamp, values
    """
    values = line.strip().split(sep)
    if len(values) < 2:
        raise ValueError(f"Invalid number of values found: {len(values)}")
    # First value is timestamp
    timestamp = _clean(values[timestamp_index])
    try:
        dt = parser.parse(timestamp)
    except ValueError as e:
        raise ValueError(
            f"Failed to parse timestamp: {timestamp} - {e}") from e
    # Rest are values
    return dt, [_clean_number(v) for v in values[timestamp_index+1:]]


def parse_generic_csv_file(filename: str, label_to_channel_code: dict[str, str],
                           header_index: int, data_index: int,
                           timezone: Optional[pytz.BaseTzInfo] = None) -> list[DataSample]:
    """
    header_index: Number of lines to skip before the header
    data_index: Number of lines to skip after the header before the data

    We are avoiding using pandas here to keep dependencies minimal as this is often run on edge devices.
    """

    output: list[DataSample] = []
    with open(filename, "r") as f:
        for _ in range(header_index):
            next(f, None)
        header = next(f, None)
        if header is None:
            logging.error(f"Failed to read header from file: {filename}")
            return []
        try:
            header_columns = _parse_header(header)
        except ValueError as e:
            logging.error(f"Failed to parse header: {e}")
            return []

        # Skip any metadata lines between the header and the data
        for _ in range(data_index):
            next(f, None)
        while (line := next(f, None)):
            try:
                timestamp, line_values = _parse_csv_line(line)
            except ValueError as e:
                logging.error(f"Failed to parse line: {e}")
                return []

            if len(header_columns) != len(line_values):
                logging.error(
                    f"Invalid number of values found: {len(line_values)}, expected {len(header_columns)}")
                return []

            if timezone is not None and timestamp.tzinfo is None:
                timestamp = timezone.localize(timestamp)

            for header, value in zip(header_columns, line_values):
                if header not in label_to_channel_code:
                    logger.error(f"Label not found in table map: {header}")
                    continue

                if value is None:
                    logger.error(
                        f"Failed to parse value: {value} for column {header}")
                    continue
                channel_code = label_to_channel_code[header]

                logger.debug(
                    f"Adding entry for label: {header} with value: {value} and timestamp: {timestamp}")
                output.append(DataSample(timestamp=timestamp.isoformat(),
                                         channel_code=channel_code, value=value))
    return output
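A minimal end-to-end sketch of `parse_generic_csv_file` on a Campbell-style file, using the same `header_index=1, data_index=2` that `parse_campbell_file` passes (skip the TOA5 environment line, read the header, then skip the units and aggregation rows). The column labels, channel codes and readings are illustrative:

```python
import tempfile

from mercuto_client.ingester.parsers.generic_csv import parse_generic_csv_file

toa5 = (
    '"TOA5","CR1000","CR1000","1234","CR1000.Std.32","CPU:prog.CR1","5678","Table1"\n'
    '"TIMESTAMP","Temp_C","RH"\n'
    '"TS","Deg C","%"\n'
    '"","Avg","Avg"\n'
    '"2024-01-02 03:00:00",21.5,"NAN"\n'
)
with tempfile.NamedTemporaryFile('w', suffix='.dat', delete=False) as f:
    f.write(toa5)
    path = f.name

samples = parse_generic_csv_file(
    path,
    {"Temp_C": "CH-TEMP", "RH": "CH-RH"},  # label -> channel code (illustrative)
    header_index=1, data_index=2)
for sample in samples:
    print(sample)
```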
mercuto_client/ingester/parsers/worldsensing.py
@@ -0,0 +1,23 @@
from typing import Optional

import pytz

from .generic_csv import DataSample, parse_generic_csv_file


def parse_worldsensing_standard_file(filename: str, label_to_channel_code: dict[str, str],
                                     timezone: Optional[pytz.BaseTzInfo] = None) -> list[DataSample]:
    """
    Parse a Worldsensing standard CSV file, as provided when downloading data or using the standard CSV export.
    """
    return parse_generic_csv_file(
        filename, label_to_channel_code, header_index=9, data_index=0, timezone=timezone)


def parse_worldsensing_compact_file(filename: str, label_to_channel_code: dict[str, str],
                                    timezone: Optional[pytz.BaseTzInfo] = None) -> list[DataSample]:
    """
    Parse a Worldsensing custom CSV file. These are generated when using the compacted CSV mechanism.
    """
    return parse_generic_csv_file(
        filename, label_to_channel_code, header_index=1, data_index=0, timezone=timezone)
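Since the Worldsensing parsers share the `Parser` protocol, a detected parser can be called uniformly, including with an optional timezone used to localize naive timestamps. A minimal sketch for the compact case; the file content is illustrative and only shaped to satisfy the `"Datalogger","compacted"` banner and the TIMESTAMP/Date-and-time header check, and the channel mapping and timezone are placeholders:

```python
import tempfile

import pytz

from mercuto_client.ingester.parsers import detect_parser

compact = (
    '"Datalogger","compacted"\n'
    '"Date-and-time","Node-1-Tilt-X"\n'
    '"2024-01-02 03:00:00",0.123\n'
)
with tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False) as f:
    f.write(compact)
    path = f.name

parser = detect_parser(path)  # resolves to parse_worldsensing_compact_file
# Naive timestamps in the file are localized to the supplied timezone.
samples = parser(path, {"Node-1-Tilt-X": "CH-TILT-X"}, timezone=pytz.timezone("Australia/Perth"))
```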