pixeltable 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +53 -0
- pixeltable/__version__.py +3 -0
- pixeltable/catalog/__init__.py +13 -0
- pixeltable/catalog/catalog.py +159 -0
- pixeltable/catalog/column.py +181 -0
- pixeltable/catalog/dir.py +32 -0
- pixeltable/catalog/globals.py +33 -0
- pixeltable/catalog/insertable_table.py +192 -0
- pixeltable/catalog/named_function.py +36 -0
- pixeltable/catalog/path.py +58 -0
- pixeltable/catalog/path_dict.py +139 -0
- pixeltable/catalog/schema_object.py +39 -0
- pixeltable/catalog/table.py +695 -0
- pixeltable/catalog/table_version.py +1026 -0
- pixeltable/catalog/table_version_path.py +133 -0
- pixeltable/catalog/view.py +203 -0
- pixeltable/dataframe.py +749 -0
- pixeltable/env.py +466 -0
- pixeltable/exceptions.py +17 -0
- pixeltable/exec/__init__.py +10 -0
- pixeltable/exec/aggregation_node.py +78 -0
- pixeltable/exec/cache_prefetch_node.py +116 -0
- pixeltable/exec/component_iteration_node.py +79 -0
- pixeltable/exec/data_row_batch.py +94 -0
- pixeltable/exec/exec_context.py +22 -0
- pixeltable/exec/exec_node.py +61 -0
- pixeltable/exec/expr_eval_node.py +217 -0
- pixeltable/exec/in_memory_data_node.py +73 -0
- pixeltable/exec/media_validation_node.py +43 -0
- pixeltable/exec/sql_scan_node.py +226 -0
- pixeltable/exprs/__init__.py +25 -0
- pixeltable/exprs/arithmetic_expr.py +102 -0
- pixeltable/exprs/array_slice.py +71 -0
- pixeltable/exprs/column_property_ref.py +77 -0
- pixeltable/exprs/column_ref.py +114 -0
- pixeltable/exprs/comparison.py +77 -0
- pixeltable/exprs/compound_predicate.py +98 -0
- pixeltable/exprs/data_row.py +199 -0
- pixeltable/exprs/expr.py +594 -0
- pixeltable/exprs/expr_set.py +39 -0
- pixeltable/exprs/function_call.py +382 -0
- pixeltable/exprs/globals.py +69 -0
- pixeltable/exprs/image_member_access.py +96 -0
- pixeltable/exprs/in_predicate.py +96 -0
- pixeltable/exprs/inline_array.py +109 -0
- pixeltable/exprs/inline_dict.py +103 -0
- pixeltable/exprs/is_null.py +38 -0
- pixeltable/exprs/json_mapper.py +121 -0
- pixeltable/exprs/json_path.py +159 -0
- pixeltable/exprs/literal.py +66 -0
- pixeltable/exprs/object_ref.py +41 -0
- pixeltable/exprs/predicate.py +44 -0
- pixeltable/exprs/row_builder.py +329 -0
- pixeltable/exprs/rowid_ref.py +94 -0
- pixeltable/exprs/similarity_expr.py +65 -0
- pixeltable/exprs/type_cast.py +53 -0
- pixeltable/exprs/variable.py +45 -0
- pixeltable/ext/__init__.py +5 -0
- pixeltable/ext/functions/yolox.py +92 -0
- pixeltable/func/__init__.py +7 -0
- pixeltable/func/aggregate_function.py +197 -0
- pixeltable/func/callable_function.py +113 -0
- pixeltable/func/expr_template_function.py +99 -0
- pixeltable/func/function.py +141 -0
- pixeltable/func/function_registry.py +227 -0
- pixeltable/func/globals.py +46 -0
- pixeltable/func/nos_function.py +202 -0
- pixeltable/func/signature.py +162 -0
- pixeltable/func/udf.py +164 -0
- pixeltable/functions/__init__.py +95 -0
- pixeltable/functions/eval.py +215 -0
- pixeltable/functions/fireworks.py +34 -0
- pixeltable/functions/huggingface.py +167 -0
- pixeltable/functions/image.py +16 -0
- pixeltable/functions/openai.py +289 -0
- pixeltable/functions/pil/image.py +147 -0
- pixeltable/functions/string.py +13 -0
- pixeltable/functions/together.py +143 -0
- pixeltable/functions/util.py +52 -0
- pixeltable/functions/video.py +62 -0
- pixeltable/globals.py +425 -0
- pixeltable/index/__init__.py +2 -0
- pixeltable/index/base.py +51 -0
- pixeltable/index/embedding_index.py +168 -0
- pixeltable/io/__init__.py +3 -0
- pixeltable/io/hf_datasets.py +188 -0
- pixeltable/io/pandas.py +148 -0
- pixeltable/io/parquet.py +192 -0
- pixeltable/iterators/__init__.py +3 -0
- pixeltable/iterators/base.py +52 -0
- pixeltable/iterators/document.py +432 -0
- pixeltable/iterators/video.py +88 -0
- pixeltable/metadata/__init__.py +58 -0
- pixeltable/metadata/converters/convert_10.py +18 -0
- pixeltable/metadata/converters/convert_12.py +3 -0
- pixeltable/metadata/converters/convert_13.py +41 -0
- pixeltable/metadata/schema.py +234 -0
- pixeltable/plan.py +620 -0
- pixeltable/store.py +424 -0
- pixeltable/tool/create_test_db_dump.py +184 -0
- pixeltable/tool/create_test_video.py +81 -0
- pixeltable/type_system.py +846 -0
- pixeltable/utils/__init__.py +17 -0
- pixeltable/utils/arrow.py +98 -0
- pixeltable/utils/clip.py +18 -0
- pixeltable/utils/coco.py +136 -0
- pixeltable/utils/documents.py +69 -0
- pixeltable/utils/filecache.py +195 -0
- pixeltable/utils/help.py +11 -0
- pixeltable/utils/http_server.py +70 -0
- pixeltable/utils/media_store.py +76 -0
- pixeltable/utils/pytorch.py +91 -0
- pixeltable/utils/s3.py +13 -0
- pixeltable/utils/sql.py +17 -0
- pixeltable/utils/transactional_directory.py +35 -0
- pixeltable-0.0.0.dist-info/LICENSE +18 -0
- pixeltable-0.0.0.dist-info/METADATA +131 -0
- pixeltable-0.0.0.dist-info/RECORD +119 -0
- pixeltable-0.0.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
def print_perf_counter_delta(delta: float) -> str:
    """Format a performance counter delta in a human-readable way.

    Despite its name, this function does not print anything; it returns the formatted text.
    The unit is selected from the magnitude of the delta, so a negative delta is rendered in
    the same unit as its positive counterpart (and keeps its sign).

    Args:
        delta: delta in seconds

    Returns:
        Human-readable string with two decimals and one of the units ns, us, ms or s
    """
    magnitude = abs(delta)
    if magnitude < 1e-6:
        return f'{delta * 1e9:.2f} ns'
    if magnitude < 1e-3:
        return f'{delta * 1e6:.2f} us'
    if magnitude < 1:
        return f'{delta * 1e3:.2f} ms'
    return f'{delta:.2f} s'
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Any, Dict, Iterable, Iterator, Optional
|
|
3
|
+
|
|
4
|
+
import pyarrow as pa
|
|
5
|
+
|
|
6
|
+
import pixeltable.type_system as ts
|
|
7
|
+
|
|
8
|
+
_logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
# Arrow -> pixeltable: keyed by pyarrow DataType instances; every mapped column type is nullable.
# NOTE(review): pa.float64(), pa.int16() and pa.uint16() have no entry here, so to_pixeltable_type()
# returns None for them - confirm whether that is intentional.
_pa_to_pt: Dict[pa.DataType, ts.ColumnType] = {
    pa.string(): ts.StringType(nullable=True),
    pa.timestamp('us'): ts.TimestampType(nullable=True),
    pa.bool_(): ts.BoolType(nullable=True),
    pa.uint8(): ts.IntType(nullable=True),
    pa.int8(): ts.IntType(nullable=True),
    pa.uint32(): ts.IntType(nullable=True),
    pa.uint64(): ts.IntType(nullable=True),
    pa.int32(): ts.IntType(nullable=True),
    pa.int64(): ts.IntType(nullable=True),
    pa.float32(): ts.FloatType(nullable=True),
}

# pixeltable -> Arrow: keyed by ColumnType *classes* (to_arrow_type() looks entries up via
# `pixeltable_type.__class__`), not by ColumnType instances.
_pt_to_pa: Dict[type, pa.DataType] = {
    ts.StringType: pa.string(),
    ts.TimestampType: pa.timestamp('us'),  # postgres timestamp is microseconds
    ts.BoolType: pa.bool_(),
    ts.IntType: pa.int64(),
    ts.FloatType: pa.float32(),
    ts.JsonType: pa.string(),  # TODO(orm) pa.struct() is possible
    ts.ImageType: pa.binary(),  # inline image
    ts.AudioType: pa.string(),  # path
    ts.VideoType: pa.string(),  # path
    ts.DocumentType: pa.string(),  # path
}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def to_pixeltable_type(arrow_type: pa.DataType) -> Optional[ts.ColumnType]:
    """Map a pyarrow DataType to the corresponding pixeltable ColumnType.

    Scalar types are resolved through the `_pa_to_pt` table. A fixed-shape tensor becomes an
    ArrayType with the tensor's shape, whose element type is resolved recursively.

    Returns:
        The pixeltable type, or None if no conversion is currently implemented.
    """
    scalar_type = _pa_to_pt.get(arrow_type)
    if scalar_type is not None:
        return scalar_type
    if not isinstance(arrow_type, pa.FixedShapeTensorType):
        return None
    element_type = to_pixeltable_type(arrow_type.value_type)
    if element_type is None:
        # the tensor's element type has no pixeltable equivalent
        return None
    return ts.ArrayType(shape=arrow_type.shape, dtype=element_type)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
    """Map a pixeltable ColumnType to the corresponding pyarrow DataType.

    The lookup in `_pt_to_pa` is by the exact class of `pixeltable_type`. Array types are
    converted into fixed-shape tensors with the array's numpy dtype and shape.

    Returns:
        The pyarrow type, or None if no conversion is currently implemented.
    """
    type_cls = pixeltable_type.__class__
    if type_cls in _pt_to_pa:
        return _pt_to_pa[type_cls]
    if not isinstance(pixeltable_type, ts.ArrayType):
        return None
    element_type = pa.from_numpy_dtype(pixeltable_type.numpy_dtype())
    return pa.fixed_shape_tensor(element_type, pixeltable_type.shape)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def to_pixeltable_schema(arrow_schema: pa.Schema) -> Dict[str, ts.ColumnType]:
    """Convert a pyarrow schema into a dict of field name -> pixeltable column type.

    Fields whose type has no pixeltable equivalent are mapped to None (the result of
    to_pixeltable_type()).
    """
    schema = {}
    for field in arrow_schema:
        schema[field.name] = to_pixeltable_type(field.type)
    return schema
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def to_arrow_schema(pixeltable_schema: Dict[str, Any]) -> pa.Schema:
    """Convert a dict of column name -> pixeltable column type into a pyarrow schema.

    Each column type is translated with to_arrow_type(); the field order follows the dict order.
    """
    fields = [(col_name, to_arrow_type(col_type)) for col_name, col_type in pixeltable_schema.items()]
    return pa.schema(fields)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def to_pydict(batch: pa.RecordBatch) -> Dict[str, Iterable[Any]]:
    """Convert a RecordBatch into a dict of column name -> column values.

    Unlike pa.lib.RecordBatch.to_pydict, fixed-shape tensor columns are returned via to_numpy()
    rather than as lists, which preserves their numpy dtype. All other columns are returned as
    lists of Python values.
    """
    result = {}
    for idx, col_name in enumerate(batch.schema.names):
        column = batch.column(idx)
        is_tensor = isinstance(column.type, pa.FixedShapeTensorType)
        # tensor columns stay numpy to keep their dtype; the rest become plain python values
        result[col_name] = column.to_numpy(zero_copy_only=False) if is_tensor else column.to_pylist()
    return result
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def iter_tuples(batch: pa.RecordBatch) -> Iterator[Dict[str, Any]]:
    """Iterate over the rows of a RecordBatch as dicts of column name -> value.

    Also works with pa.Table and pa.RowGroup. Asserts that the batch has at least one column.
    """
    columns = to_pydict(batch)
    assert len(columns) > 0, 'empty record batch'
    # the row count is taken from the first column
    num_rows = len(next(iter(columns.values())))
    for row_idx in range(num_rows):
        yield {col_name: values[row_idx] for col_name, values in columns.items()}
|
pixeltable/utils/clip.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import PIL.Image
|
|
3
|
+
|
|
4
|
+
import pixeltable.func as func
|
|
5
|
+
from pixeltable.env import Env
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def embed_image(img: PIL.Image.Image) -> np.ndarray:
    """Compute an embedding for `img` with the openai_clip image model served by the NOS client.

    The image is resized to 224x224 before it is sent; the leading axis of the returned
    'embedding' is squeezed away.
    """
    from pixeltable.functions.nos.image_embedding import openai_clip

    spec = openai_clip.model_spec
    client = Env.get().nos_client
    response = client.Run(task=spec.task, model_name=spec.name, images=[img.resize((224, 224))])
    return response['embedding'].squeeze(0)
|
|
13
|
+
|
|
14
|
+
def embed_text(text: str) -> np.ndarray:
    """Compute an embedding for `text` with the openai_clip text model served by the NOS client.

    The leading axis of the returned 'embedding' is squeezed away.
    """
    from pixeltable.functions.nos.text_embedding import openai_clip

    spec = openai_clip.model_spec
    client = Env.get().nos_client
    response = client.Run(task=spec.task, model_name=spec.name, texts=[text])
    return response['embedding'].squeeze(0)
|
pixeltable/utils/coco.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
from typing import List, Dict, Any, Set
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
import PIL
|
|
6
|
+
|
|
7
|
+
import pixeltable.exceptions as excs
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
format_msg = """
|
|
11
|
+
|
|
12
|
+
Required format:
|
|
13
|
+
{
|
|
14
|
+
'image': PIL.Image.Image,
|
|
15
|
+
'annotations': [
|
|
16
|
+
{
|
|
17
|
+
'bbox': [x: int, y: int, w: int, h: int],
|
|
18
|
+
'category': str | int,
|
|
19
|
+
},
|
|
20
|
+
...
|
|
21
|
+
],
|
|
22
|
+
}
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
def _verify_input_dict(input_dict: Dict[str, Any]) -> None:
    """Verify that input_dict is a valid input dict for write_coco_dataset()

    The expected structure is the one described by `format_msg`: an 'image' entry holding a
    PIL image and an 'annotations' list of dicts, each with a 'bbox' ([x, y, w, h] of ints)
    and a 'category' (str or int).

    Raises:
        excs.Error: on the first violation found; the message includes the offending value
            and the required format.
    """
    # `import PIL` alone does not load the PIL.Image submodule; import it explicitly so that
    # the isinstance() check below does not depend on another module having imported it.
    import PIL.Image

    if not isinstance(input_dict, dict):
        raise excs.Error(f'Expected dict, got {input_dict}{format_msg}')
    if 'image' not in input_dict:
        raise excs.Error(f'Missing key "image" in input dict: {input_dict}{format_msg}')
    if not isinstance(input_dict['image'], PIL.Image.Image):
        raise excs.Error(f'Value for "image" is not a PIL.Image.Image: {input_dict}{format_msg}')
    if 'annotations' not in input_dict:
        raise excs.Error(f'Missing key "annotations" in input dict: {input_dict}{format_msg}')
    if not isinstance(input_dict['annotations'], list):
        raise excs.Error(f'Value for "annotations" is not a list: {input_dict}{format_msg}')

    for annotation in input_dict['annotations']:
        if not isinstance(annotation, dict):
            raise excs.Error(f'Annotation is not a dict: {annotation}{format_msg}')
        if 'bbox' not in annotation:
            raise excs.Error(f'Missing key "bbox" in annotation: {annotation}{format_msg}')
        bbox = annotation['bbox']
        if not isinstance(bbox, list):
            raise excs.Error(f'Value for "bbox" is not a list [x, y, w, h]: {annotation}{format_msg}')
        if len(bbox) != 4 or not all(isinstance(x, int) for x in bbox):
            raise excs.Error(f'Key "bbox" is not a list [x, y, w, h] of ints: {annotation}{format_msg}')
        if 'category' not in annotation:
            raise excs.Error(f'Missing key "category" in annotation: {annotation}{format_msg}')
        if not isinstance(annotation['category'], (str, int)):
            raise excs.Error(f'Value for "category" is not a str or int: {annotation}{format_msg}')
|
|
50
|
+
|
|
51
|
+
def write_coco_dataset(df: 'pixeltable.DataFrame', dest_path: Path) -> Path:
    """Export a DataFrame result set as a COCO dataset in dest_path and return the path of the data.json file.

    The select list of `df` must consist of exactly one json-typed expression whose values have
    the structure checked by _verify_input_dict(). `dest_path` must not exist yet; it is created
    together with an 'images' subdirectory. Images that don't already have a local file are
    saved as '<image id>.jpg' in that subdirectory.

    Raises:
        excs.Error: if the select list is not a single json-typed expression, or a row fails validation.
    """
    # TODO: validate schema
    if len(df._select_list_exprs) != 1 or not df._select_list_exprs[0].col_type.is_json_type():
        raise excs.Error(f'Expected exactly one json-typed column in select list: {df._select_list_exprs}')
    input_dict_slot_idx = -1  # df._select_list_exprs[0].slot_idx isn't valid until _exec()

    # create output dir
    assert not dest_path.exists()
    dest_path.mkdir(parents=False)
    images_dir = dest_path / 'images'
    images_dir.mkdir()

    images: List[Dict[str, Any]] = []
    annotations: List[Dict[str, Any]] = []
    categories: Set[Any] = set()
    for img_id, input_row in enumerate(df._exec()):
        # slot indices only become available once execution has started, so resolve them on the first row
        is_first_row = input_dict_slot_idx == -1
        if is_first_row:
            input_dict_expr = df._select_list_exprs[0]
            input_dict_slot_idx = input_dict_expr.slot_idx
        input_dict = input_row[input_dict_slot_idx]
        _verify_input_dict(input_dict)
        if is_first_row:
            # we want to know the slot idx of the image used in the input dict, so that we can check whether we
            # already have a local path for it
            img_slot_idx = next(
                (e.slot_idx for e in input_dict_expr.dependencies() if e.col_type.is_image_type()), None)
            assert img_slot_idx is not None

        # get a local path for the image
        img = input_dict['image']
        existing_path = input_row.file_paths[img_slot_idx]
        if existing_path is not None:
            # we already have a local path
            img_path = Path(existing_path)
            # TODO: if the path leads to our tmp dir, we need to move the file
        else:
            # we need to create a local path
            img_path = images_dir / f'{img_id}.jpg'
            img.save(img_path)

        # create image record
        image_record = {
            'id': img_id,
            'file_name': str(img_path),
            'width': img.width,
            'height': img.height,
        }
        images.append(image_record)

        # create annotation records for this image
        for annotation in input_dict['annotations']:
            _, _, width, height = annotation['bbox']
            category = annotation['category']
            categories.add(category)
            annotations.append({
                'id': len(annotations),  # annotation ids are consecutive across all images
                'image_id': img_id,
                # we use the category name here and fix it up at the end, when we have assigned category ids
                'category_id': category,
                'bbox': annotation['bbox'],
                'area': width * height,
                'iscrowd': 0,
            })

    # replace category names with ids
    category_ids = {category: idx for idx, category in enumerate(sorted(categories))}
    for annotation in annotations:
        annotation['category_id'] = category_ids[annotation['category_id']]

    result = {
        'images': images,
        'annotations': annotations,
        'categories': [{'id': idx, 'name': category} for category, idx in category_ids.items()],
    }
    output_path = dest_path / 'data.json'
    with open(output_path, 'w') as f:
        json.dump(result, f)
    return output_path
|
|
136
|
+
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from typing import Optional, Dict
|
|
2
|
+
import dataclasses
|
|
3
|
+
|
|
4
|
+
import pixeltable.type_system as ts
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclasses.dataclass
class DocumentHandle:
    """Parsed representation of a document, tagged with its detected format.

    get_document_handle() populates only the field that matches `format`; the other
    handle fields keep their None default.
    """
    format: ts.DocumentType.DocumentFormat
    bs_doc: Optional['bs4.BeautifulSoup'] = None  # set for HTML documents
    md_ast: Optional[Dict] = None  # set for Markdown documents
    pdf_doc: Optional['fitz.Document'] = None  # set for PDF documents
|
|
13
|
+
|
|
14
|
+
def get_document_handle(path: str) -> Optional[DocumentHandle]:
    """Open the document at `path` and return a handle for the first format that accepts it.

    Formats are tried in the order PDF, HTML, Markdown.

    Returns:
        A DocumentHandle, or None if the file is not a PDF and is either not valid utf8 text
        or not accepted by the HTML and Markdown parsers.
    """
    formats = ts.DocumentType.DocumentFormat

    # try pdf first, because a correct PDF is a binary format that
    # would trigger encoding exceptions if opened as utf8.
    pdf_doc = get_pdf_handle(path)
    if pdf_doc is not None:
        return DocumentHandle(format=formats.PDF, pdf_doc=pdf_doc)

    # currently the rest of the types are text-based, so we can open them in utf8 mode once
    try:
        with open(path, 'r', encoding='utf8') as file:
            text = file.read()
    except UnicodeDecodeError:
        # not pdf, and also not valid text file
        return None

    # bs4 will appear to succeed for md files as well.
    # this will break most markdown files at the moment.
    html_doc = get_html_handle(text)
    if html_doc is not None:
        return DocumentHandle(format=formats.HTML, bs_doc=html_doc)

    markdown = get_markdown_handle(text)
    return None if markdown is None else DocumentHandle(format=formats.MD, md_ast=markdown)
|
|
39
|
+
|
|
40
|
+
def get_html_handle(text: str) -> Optional['bs4.BeautifulSoup']:
    """Parse `text` with bs4's 'html.parser'.

    Returns:
        The parsed document, or None if parsing raised or find() located nothing in the result.
    """
    import bs4

    try:
        soup = bs4.BeautifulSoup(text, 'html.parser')
        # a parse result in which find() locates nothing is not treated as HTML
        return soup if soup.find() is not None else None
    except Exception:
        return None
|
|
49
|
+
|
|
50
|
+
def get_markdown_handle(text: str) -> Optional[Dict]:
    """Parse `text` with a mistune markdown parser created with renderer=None.

    Returns:
        The parser's result, or None if creating the parser or parsing raised.
    """
    import mistune

    try:
        parse = mistune.create_markdown(renderer=None)
        return parse(text)
    except Exception:
        return None
|
|
57
|
+
|
|
58
|
+
def get_pdf_handle(path : str) -> Optional['fitz.Document']:
    """Open `path` with PyMuPDF and return the document if it is a readable PDF.

    Returns:
        The open document, or None if the file can't be opened, is not a PDF, or its first
        page can't be read. A document that is rejected is closed before returning.
    """
    import fitz  # aka pymupdf

    try:
        doc = fitz.open(path)
    except Exception:
        return None

    try:
        # check pdf (bc it will work for images)
        if doc.is_pdf:
            # try to read one page
            next(page for page in doc)
            return doc
    except Exception:
        # unreadable document (this includes a PDF without pages): fall through to the cleanup
        pass

    # the document was rejected: release the underlying file instead of leaking it
    try:
        doc.close()
    except Exception:
        # best-effort cleanup; the result is None either way
        pass
    return None
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Optional, List, Tuple, Dict
|
|
3
|
+
from collections import OrderedDict, defaultdict, namedtuple
|
|
4
|
+
import os
|
|
5
|
+
import glob
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from time import time
|
|
8
|
+
import logging
|
|
9
|
+
from uuid import UUID
|
|
10
|
+
import hashlib
|
|
11
|
+
|
|
12
|
+
from pixeltable.env import Env
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
_logger = logging.getLogger('pixeltable')
|
|
16
|
+
|
|
17
|
+
class CacheEntry:
    """Metadata of a single file in the file cache.

    The file name of an entry encodes its identity as '<tbl_id hex>_<col_id>_<key><ext>'.
    """

    def __init__(self, key: str, tbl_id: UUID, col_id: int, size: int, last_accessed_ts: float, ext: str):
        """
        Args:
            key: cache key of the entry (third component of the file name)
            tbl_id: id of the table the file belongs to
            col_id: id of the column the file belongs to
            size: file size in bytes
            last_accessed_ts: time of last access; callers pass the file's mtime (a float)
            ext: file extension, including the leading dot
        """
        self.key = key
        self.tbl_id = tbl_id
        self.col_id = col_id
        self.size = size
        self.last_accessed_ts = last_accessed_ts
        self.ext = ext

    def path(self) -> Path:
        """Return the location of this entry's file inside Env's file cache directory."""
        file_name = f'{self.tbl_id.hex}_{self.col_id}_{self.key}{self.ext}'
        return Env.get().file_cache_dir / file_name

    @classmethod
    def from_file(cls, path: Path) -> CacheEntry:
        """Reconstruct the entry for an existing cache file from its name and file stats.

        Asserts that the file stem consists of exactly three '_'-separated components.
        """
        components = path.stem.split('_')
        assert len(components) == 3
        tbl_id_hex, col_id_str, key = components
        tbl_id = UUID(tbl_id_hex)
        col_id = int(col_id_str)
        file_info = os.stat(str(path))
        return cls(key, tbl_id, col_id, file_info.st_size, file_info.st_mtime, path.suffix)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class FileCache:
    """
    A local cache of external (eg, S3) file references in cells of a stored table (ie, table or view).

    Cache entries are identified by a hash of the file url and stored in Env.get().file_cache_dir. The time of last
    access of a cache entry is its file's mtime.

    TODO:
    - enforce a maximum capacity with LRU eviction
    - implement MRU eviction for queries that exceed the capacity
    """
    _instance: Optional[FileCache] = None
    ColumnStats = namedtuple('FileCacheColumnStats', ['tbl_id', 'col_id', 'num_files', 'total_size'])
    CacheStats = namedtuple(
        'FileCacheStats', ['total_size', 'num_requests', 'num_hits', 'num_evictions', 'column_stats'])

    @classmethod
    def get(cls) -> FileCache:
        """Return the shared FileCache instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        """Rebuild the in-memory index from the files currently present in the cache directory."""
        self.cache: OrderedDict[str, CacheEntry] = OrderedDict()  # ordered by entry.last_accessed_ts
        self.total_size = 0
        self.num_requests = 0
        self.num_hits = 0
        self.num_evictions = 0
        paths = glob.glob(str(Env.get().file_cache_dir / '*'))
        entries = [CacheEntry.from_file(Path(path_str)) for path_str in paths]
        # we need to insert entries in order of last_accessed_ts
        entries.sort(key=lambda e: e.last_accessed_ts)
        for entry in entries:
            self.cache[entry.key] = entry
            self.total_size += entry.size

    def avg_file_size(self) -> int:
        """Return the average size in bytes of the cached files, truncated to an int (0 if the cache is empty)."""
        if len(self.cache) == 0:
            return 0
        return int(self.total_size / len(self.cache))

    def num_files(self, tbl_id: Optional[UUID] = None) -> int:
        """Return the number of cached files, optionally restricted to those of table `tbl_id`."""
        if tbl_id is None:
            return len(self.cache)
        return sum(1 for e in self.cache.values() if e.tbl_id == tbl_id)

    def clear(self, tbl_id: Optional[UUID] = None, capacity: Optional[int] = None) -> None:
        """
        For testing purposes: reset the stats and remove cached files.

        Args:
            tbl_id: if given, only the entries of that table are removed; otherwise all entries
            capacity: currently ignored (a capacity limit is not implemented yet, see class TODO)
        """
        self.num_requests, self.num_hits, self.num_evictions = 0, 0, 0
        entries = list(self.cache.values())  # copy: self.cache is modified below
        if tbl_id is not None:
            entries = [e for e in entries if e.tbl_id == tbl_id]
            _logger.debug(f'clearing {len(entries)} entries from file cache for table {tbl_id}')
        else:
            _logger.debug(f'clearing {len(entries)} entries from file cache')
        for entry in entries:
            del self.cache[entry.key]
            self.total_size -= entry.size
            os.remove(entry.path())

    def _url_hash(self, url: str) -> str:
        """Return the cache key for `url`: the hex sha256 digest of the encoded url."""
        h = hashlib.sha256()
        h.update(url.encode())
        return h.hexdigest()

    def lookup(self, url: str) -> Optional[Path]:
        """Return the local path of the cached file for `url`, or None on a cache miss.

        A hit refreshes the file's mtime and moves the entry to the most-recently-used end.
        """
        self.num_requests += 1
        key = self._url_hash(url)
        entry = self.cache.get(key, None)
        if entry is None:
            _logger.debug(f'file cache miss for {url}')
            return None
        # update mtime and cache
        path = entry.path()
        path.touch(exist_ok=True)
        file_info = os.stat(str(path))
        entry.last_accessed_ts = file_info.st_mtime
        self.cache.move_to_end(key, last=True)
        self.num_hits += 1
        _logger.debug(f'file cache hit for {url}')
        return path

    def add(self, tbl_id: UUID, col_id: int, url: str, path: Path) -> Path:
        """Adds url at 'path' to cache and returns its new path.
        'path' will not be accessible after this call. Retains the extension of 'path'.

        Asserts that `url` is not cached yet.
        """
        file_info = os.stat(str(path))
        # TODO: evict LRU entries here once a capacity limit is enforced
        key = self._url_hash(url)
        assert key not in self.cache
        entry = CacheEntry(key, tbl_id, col_id, file_info.st_size, file_info.st_mtime, path.suffix)
        new_path = entry.path()
        # move the file before registering the entry, so that a failed rename doesn't leave an
        # entry without a backing file in the index
        os.rename(str(path), str(new_path))
        self.cache[key] = entry
        self.total_size += entry.size
        _logger.debug(f'added entry for cell {url} to file cache')
        return new_path

    def stats(self) -> CacheStats:
        """Return overall cache counters plus per-column stats, sorted by total size (largest first)."""
        # collect column stats
        # (tbl_id, col_id) -> [num_files, total_size]
        d: Dict[Tuple[UUID, int], List[int]] = defaultdict(lambda: [0, 0])
        for entry in self.cache.values():
            t = d[(entry.tbl_id, entry.col_id)]
            t[0] += 1
            t[1] += entry.size
        col_stats = [
            self.ColumnStats(tbl_id, col_id, num_files, size) for (tbl_id, col_id), (num_files, size) in d.items()
        ]
        col_stats.sort(key=lambda e: e.total_size, reverse=True)
        return self.CacheStats(self.total_size, self.num_requests, self.num_hits, self.num_evictions, col_stats)

    def debug_print(self) -> None:
        """Print one line per cache entry to stdout."""
        for entry in self.cache.values():
            print(f'CacheEntry: tbl_id={entry.tbl_id}, col_id={entry.col_id}, size={entry.size}')
|
pixeltable/utils/help.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import http
|
|
2
|
+
import http.server
|
|
3
|
+
import logging
|
|
4
|
+
import urllib
|
|
5
|
+
import posixpath
|
|
6
|
+
import pathlib
|
|
7
|
+
import os
|
|
8
|
+
import string
|
|
9
|
+
|
|
10
|
+
_logger = logging.getLogger('pixeltable.http.server')
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_file_uri(http_address: str, file_path: str) -> str:
    """Get the URI for a file path, with the given prefix.
    Used in the client to generate a URI

    Args:
        http_address: prefix of the resulting URI (the address of the file server)
        file_path: absolute path of the file; asserted to be absolute

    Returns:
        `http_address` followed by the URL-quoted form of `file_path`
    """
    # `import urllib` alone does not load the `request` submodule; import it explicitly instead
    # of relying on some other module having done so
    import urllib.request

    abs_path = pathlib.Path(file_path)
    assert abs_path.is_absolute()
    url = urllib.request.pathname2url(str(abs_path))
    return f'{http_address}{url}'
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class AbsolutePathHandler(http.server.SimpleHTTPRequestHandler):
    """Serves all absolute paths, not just the current directory"""

    def translate_path(self, path: str) -> str:
        """
        Translate a /-separated PATH to the local filename syntax.
        overrides http.server.SimpleHTTPRequestHandler.translate_path

        This is only useful for file serving.

        Code initially taken from there:
        https://github.com/python/cpython/blob/f5406ef454662b98df107775d18ff71ae6849618/Lib/http/server.py#L834

        NOTE(review): the request path is used as the local path without being confined to a
        base directory, so requests can address any absolute path - confirm this is only ever
        bound to a trusted interface.
        """
        # `import urllib` alone does not load the `request` submodule; import it explicitly
        import urllib.request

        _logger.info(f'translate path {path=}')
        # abandon query parameters, taken from http.server.SimpleHTTPRequestHandler
        path = path.split('?', 1)[0]
        path = path.split('#', 1)[0]

        path = pathlib.Path(urllib.request.url2pathname(path))
        return str(path)

    def log_message(self, format, *args) -> None:
        """override logging to stderr in http.server.BaseHTTPRequestHandler"""
        message = format % args
        # strip control characters with the table provided by the base handler before logging
        _logger.info(message.translate(self._control_char_table))
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class LoggingHTTPServer(http.server.ThreadingHTTPServer):
    """Avoids polluting stdout and stderr"""

    def handle_error(self, request, client_address) -> None:
        """override socketserver.TCPServer.handle_error which prints directly to sys.stderr

        Logs the request, the client address and the current exception's traceback as an error.
        """
        import traceback

        _logger.error(
            f'Exception occurred during processing of {request=} from {client_address=}'
            f'\nbacktrace:\n{traceback.format_exc()}\n----\n'
        )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def make_server(address: str, port: int) -> http.server.HTTPServer:
    """Create a file server with pixeltable specific config

    The server is a LoggingHTTPServer bound to (address, port) that handles requests with
    AbsolutePathHandler.
    """
    bind_address = (address, port)
    return LoggingHTTPServer(bind_address, AbsolutePathHandler)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
if __name__ == '__main__':
    # manual test entry point: serve files on localhost until the process is interrupted
    httpd = make_server('127.0.0.1', 8000)
    print(f'about to serve HTTP on {httpd.server_address}')
    try:
        httpd.serve_forever()
    finally:
        # release the listening socket even when serving is interrupted (eg, Ctrl-C)
        httpd.server_close()
|