xmlpydict 0.0.8__tar.gz → 0.0.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xmlpydict-0.0.8/src/xmlpydict.egg-info → xmlpydict-0.0.12}/PKG-INFO +14 -19
- {xmlpydict-0.0.8 → xmlpydict-0.0.12}/README.md +8 -14
- {xmlpydict-0.0.8 → xmlpydict-0.0.12}/pyproject.toml +4 -4
- {xmlpydict-0.0.8 → xmlpydict-0.0.12}/setup.py +2 -2
- xmlpydict-0.0.12/src/xmlparse.cpp +222 -0
- {xmlpydict-0.0.8 → xmlpydict-0.0.12}/tests/test_parse.py +76 -22
- xmlpydict-0.0.12/xmlpydict/__init__.py +75 -0
- {xmlpydict-0.0.8 → xmlpydict-0.0.12/xmlpydict.egg-info}/PKG-INFO +14 -19
- xmlpydict-0.0.12/xmlpydict.egg-info/SOURCES.txt +13 -0
- {xmlpydict-0.0.8/src → xmlpydict-0.0.12}/xmlpydict.egg-info/requires.txt +1 -1
- xmlpydict-0.0.12/xmlpydict.egg-info/top_level.txt +2 -0
- xmlpydict-0.0.8/src/xmlparse.cpp +0 -413
- xmlpydict-0.0.8/src/xmlparse.py +0 -68
- xmlpydict-0.0.8/src/xmlpydict.egg-info/SOURCES.txt +0 -14
- xmlpydict-0.0.8/src/xmlpydict.egg-info/top_level.txt +0 -2
- xmlpydict-0.0.8/tests/test.py +0 -24
- {xmlpydict-0.0.8 → xmlpydict-0.0.12}/LICENSE +0 -0
- {xmlpydict-0.0.8 → xmlpydict-0.0.12}/MANIFEST.in +0 -0
- {xmlpydict-0.0.8 → xmlpydict-0.0.12}/setup.cfg +0 -0
- {xmlpydict-0.0.8/src → xmlpydict-0.0.12}/xmlpydict.egg-info/dependency_links.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: xmlpydict
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.12
|
|
4
4
|
Summary: xml to dictionary tool for python
|
|
5
5
|
Author-email: Matthew Taylor <matthew.taylor.andre@gmail.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/MatthewAndreTaylor/xml-to-pydict
|
|
@@ -10,29 +10,30 @@ Classifier: Intended Audience :: Developers
|
|
|
10
10
|
Classifier: License :: OSI Approved :: MIT License
|
|
11
11
|
Classifier: Programming Language :: Python :: 3
|
|
12
12
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.7
|
|
14
13
|
Classifier: Programming Language :: Python :: 3.8
|
|
15
14
|
Classifier: Programming Language :: Python :: 3.9
|
|
16
15
|
Classifier: Programming Language :: Python :: 3.10
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.11
|
|
18
17
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
19
18
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
-
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup :: XML
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
21
|
Description-Content-Type: text/markdown
|
|
22
22
|
License-File: LICENSE
|
|
23
23
|
Provides-Extra: tests
|
|
24
24
|
Requires-Dist: pytest; extra == "tests"
|
|
25
|
-
Requires-Dist:
|
|
25
|
+
Requires-Dist: requests; extra == "tests"
|
|
26
|
+
Dynamic: license-file
|
|
26
27
|
|
|
27
28
|
# xmlpydict 📑
|
|
28
29
|
|
|
29
30
|
[](https://github.com/MatthewAndreTaylor/xml-to-pydict/actions/workflows/tests.yml)
|
|
30
|
-
[](https://github.com/MatthewAndreTaylor/xml-to-pydict)
|
|
31
32
|
[](https://pypi.org/project/xmlpydict/)
|
|
32
33
|
|
|
33
34
|
## Requirements
|
|
34
35
|
|
|
35
|
-
- `python 3.
|
|
36
|
+
- `python 3.8+`
|
|
36
37
|
|
|
37
38
|
## Installation
|
|
38
39
|
|
|
@@ -54,13 +55,11 @@ pip install xmlpydict
|
|
|
54
55
|
|
|
55
56
|
## Goals
|
|
56
57
|
|
|
57
|
-
Create a consistent parsing strategy between
|
|
58
|
-
xmlpydict takes a more laid pack approack to enforcing the syntax of xml.
|
|
58
|
+
Create a consistent parsing strategy between XML and Python dictionaries using the specification found [here](https://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html). `xmlpydict` focuses on speed; see the benchmarks below.
|
|
59
59
|
|
|
60
|
-
|
|
60
|
+
<img width="256" alt="small_xml_document" src="https://github.com/user-attachments/assets/0248a408-6bb6-4790-bd0f-f90537e2f21a" />
|
|
61
|
+
<img width="256" alt="large_xml_document" src="https://github.com/user-attachments/assets/539a2a69-f475-46a5-bffc-1e8805a5a5e7" />
|
|
61
62
|
|
|
62
|
-
xmlpydict allows for multiple root elements.
|
|
63
|
-
The root object is treated as the python object.
|
|
64
63
|
|
|
65
64
|
### xmlpydict supports the following
|
|
66
65
|
|
|
@@ -72,19 +71,15 @@ The root object is treated as the python object.
|
|
|
72
71
|
|
|
73
72
|
[Characters](https://www.w3.org/TR/xml/#charsets): Like CDATA, character data is stored as {'#text': Char}; however, leading and trailing whitespace is stripped from this text.
|
|
74
73
|
|
|
75
|
-
### dict.get(key[, default]) will not cause exceptions
|
|
76
|
-
|
|
77
74
|
```py
|
|
78
75
|
# Empty tags are containers
|
|
79
76
|
>>> from xmlpydict import parse
|
|
80
77
|
>>> parse("<a></a>")
|
|
81
|
-
{'a':
|
|
78
|
+
{'a': None}
|
|
82
79
|
>>> parse("<a/>")
|
|
83
|
-
{'a':
|
|
80
|
+
{'a': None}
|
|
84
81
|
>>> parse("<a/>").get('href')
|
|
85
82
|
None
|
|
86
|
-
>>> parse("")
|
|
87
|
-
{}
|
|
88
83
|
```
|
|
89
84
|
|
|
90
85
|
### Attribute prefixing
|
|
@@ -103,7 +98,7 @@ None
|
|
|
103
98
|
# Grammar and structure of the xml_content is checked while parsing
|
|
104
99
|
>>> from xmlpydict import parse
|
|
105
100
|
>>> parse("<a></ a>")
|
|
106
|
-
|
|
101
|
+
xml.parsers.expat.ExpatError: not well-formed (invalid token): line 1, column 5
|
|
107
102
|
```
|
|
108
103
|
|
|
109
104
|
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
# xmlpydict 📑
|
|
2
2
|
|
|
3
3
|
[](https://github.com/MatthewAndreTaylor/xml-to-pydict/actions/workflows/tests.yml)
|
|
4
|
-
[](https://github.com/MatthewAndreTaylor/xml-to-pydict)
|
|
5
5
|
[](https://pypi.org/project/xmlpydict/)
|
|
6
6
|
|
|
7
7
|
## Requirements
|
|
8
8
|
|
|
9
|
-
- `python 3.
|
|
9
|
+
- `python 3.8+`
|
|
10
10
|
|
|
11
11
|
## Installation
|
|
12
12
|
|
|
@@ -28,13 +28,11 @@ pip install xmlpydict
|
|
|
28
28
|
|
|
29
29
|
## Goals
|
|
30
30
|
|
|
31
|
-
Create a consistent parsing strategy between
|
|
32
|
-
xmlpydict takes a more laid pack approack to enforcing the syntax of xml.
|
|
31
|
+
Create a consistent parsing strategy between XML and Python dictionaries using the specification found [here](https://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html). `xmlpydict` focuses on speed; see the benchmarks below.
|
|
33
32
|
|
|
34
|
-
|
|
33
|
+
<img width="256" alt="small_xml_document" src="https://github.com/user-attachments/assets/0248a408-6bb6-4790-bd0f-f90537e2f21a" />
|
|
34
|
+
<img width="256" alt="large_xml_document" src="https://github.com/user-attachments/assets/539a2a69-f475-46a5-bffc-1e8805a5a5e7" />
|
|
35
35
|
|
|
36
|
-
xmlpydict allows for multiple root elements.
|
|
37
|
-
The root object is treated as the python object.
|
|
38
36
|
|
|
39
37
|
### xmlpydict supports the following
|
|
40
38
|
|
|
@@ -46,19 +44,15 @@ The root object is treated as the python object.
|
|
|
46
44
|
|
|
47
45
|
[Characters](https://www.w3.org/TR/xml/#charsets): Like CDATA, character data is stored as {'#text': Char}; however, leading and trailing whitespace is stripped from this text.
|
|
48
46
|
|
|
49
|
-
### dict.get(key[, default]) will not cause exceptions
|
|
50
|
-
|
|
51
47
|
```py
|
|
52
48
|
# Empty tags are containers
|
|
53
49
|
>>> from xmlpydict import parse
|
|
54
50
|
>>> parse("<a></a>")
|
|
55
|
-
{'a':
|
|
51
|
+
{'a': None}
|
|
56
52
|
>>> parse("<a/>")
|
|
57
|
-
{'a':
|
|
53
|
+
{'a': None}
|
|
58
54
|
>>> parse("<a/>").get('href')
|
|
59
55
|
None
|
|
60
|
-
>>> parse("")
|
|
61
|
-
{}
|
|
62
56
|
```
|
|
63
57
|
|
|
64
58
|
### Attribute prefixing
|
|
@@ -77,7 +71,7 @@ None
|
|
|
77
71
|
# Grammar and structure of the xml_content is checked while parsing
|
|
78
72
|
>>> from xmlpydict import parse
|
|
79
73
|
>>> parse("<a></ a>")
|
|
80
|
-
|
|
74
|
+
xml.parsers.expat.ExpatError: not well-formed (invalid token): line 1, column 5
|
|
81
75
|
```
|
|
82
76
|
|
|
83
77
|
|
|
@@ -4,13 +4,13 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "xmlpydict"
|
|
7
|
-
version = "0.0.
|
|
7
|
+
version = "0.0.12"
|
|
8
8
|
description="xml to dictionary tool for python"
|
|
9
9
|
authors = [
|
|
10
10
|
{name = "Matthew Taylor", email = "matthew.taylor.andre@gmail.com"},
|
|
11
11
|
]
|
|
12
12
|
urls = {Homepage = "https://github.com/MatthewAndreTaylor/xml-to-pydict"}
|
|
13
|
-
requires-python = ">=3.
|
|
13
|
+
requires-python = ">=3.8"
|
|
14
14
|
keywords = [ "xml", "dictionary" ]
|
|
15
15
|
classifiers = [
|
|
16
16
|
"Development Status :: 3 - Alpha",
|
|
@@ -18,13 +18,13 @@ classifiers = [
|
|
|
18
18
|
"License :: OSI Approved :: MIT License",
|
|
19
19
|
"Programming Language :: Python :: 3",
|
|
20
20
|
"Programming Language :: Python :: 3 :: Only",
|
|
21
|
-
"Programming Language :: Python :: 3.7",
|
|
22
21
|
"Programming Language :: Python :: 3.8",
|
|
23
22
|
"Programming Language :: Python :: 3.9",
|
|
24
23
|
"Programming Language :: Python :: 3.10",
|
|
25
24
|
"Programming Language :: Python :: 3.11",
|
|
26
25
|
"Programming Language :: Python :: Implementation :: CPython",
|
|
27
26
|
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
27
|
+
"Topic :: Text Processing :: Markup :: XML",
|
|
28
28
|
]
|
|
29
29
|
|
|
30
30
|
[project.readme]
|
|
@@ -32,4 +32,4 @@ file = "README.md"
|
|
|
32
32
|
content-type = "text/markdown"
|
|
33
33
|
|
|
34
34
|
[project.optional-dependencies]
|
|
35
|
-
tests = [ "pytest", "
|
|
35
|
+
tests = [ "pytest", "requests" ]
|
|
@@ -16,8 +16,8 @@ class build_ext(build_ext_orig):
|
|
|
16
16
|
setup(
|
|
17
17
|
include_package_data=True,
|
|
18
18
|
ext_modules=[
|
|
19
|
-
Extension("
|
|
19
|
+
Extension("pyxmlhandler", ["src/xmlparse.cpp"]),
|
|
20
20
|
],
|
|
21
21
|
cmdclass={"build_ext": build_ext},
|
|
22
|
-
|
|
22
|
+
packages=["xmlpydict"],
|
|
23
23
|
)
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) 2023 Matthew Andre Taylor
|
|
3
|
+
*/
|
|
4
|
+
#include <Python.h>
#include <stdio.h>
#include <cctype>
#include <new>
#include <vector>
|
|
8
|
+
|
|
9
|
+
static PyObject* strip(PyObject* s_obj) {
|
|
10
|
+
Py_ssize_t start = 0;
|
|
11
|
+
Py_ssize_t end = PyUnicode_GetLength(s_obj);
|
|
12
|
+
while (start < end && std::isspace(PyUnicode_ReadChar(s_obj, start))) {
|
|
13
|
+
++start;
|
|
14
|
+
}
|
|
15
|
+
while (end > start && std::isspace(PyUnicode_ReadChar(s_obj, end - 1))) {
|
|
16
|
+
--end;
|
|
17
|
+
}
|
|
18
|
+
return PyUnicode_Substring(s_obj, start, end);
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
// State shared by the characters/startElement/endElement callbacks while an
// XML document is turned into nested Python dicts.
typedef struct {
  PyObject_HEAD PyObject* item;  // current dict
  PyObject* data;  // character data buffer
  // Values of `item` saved by startElement and restored by endElement.
  std::vector<PyObject*> item_stack;
  // Values of `data` saved by startElement and restored by endElement.
  std::vector<PyObject*> data_stack;
  // str prepended to every attribute name (built from the `attr_prefix` argument).
  PyObject* attr_prefix;
  // str key under which an element's text is stored next to its attributes.
  PyObject* cdata_key;
} PyDictHandler;
|
|
29
|
+
|
|
30
|
+
// tp_new: allocate a handler and construct its C++ members.
//
// tp_alloc only zero-fills the object, so the std::vector members have to be
// constructed explicitly with placement new before anything pushes to them.
// Returns NULL with an exception set if the allocation fails.
static PyObject* PyDictHandler_new(PyTypeObject* type, PyObject* args,
                                   PyObject* kwargs) {
  PyDictHandler* self = (PyDictHandler*)type->tp_alloc(type, 0);
  if (self == NULL) {
    return NULL;
  }
  new (&self->item_stack) std::vector<PyObject*>();
  new (&self->data_stack) std::vector<PyObject*>();
  return (PyObject*)self;
}
|
|
36
|
+
|
|
37
|
+
// tp_init: accept the optional `attr_prefix` (default "@") and `cdata_key`
// (default "#text") keyword arguments and reset the handler state.
//
// Returns 0 on success and -1 with an exception set on failure.
static int PyDictHandler_init(PyDictHandler* self, PyObject* args,
                              PyObject* kwargs) {
  const char* attr_prefix = "@";
  const char* cdata_key = "#text";
  // String literals are const in C++; the cast below only adapts the array to
  // the non-const parameter type of PyArg_ParseTupleAndKeywords.
  static const char* kwlist[] = {"attr_prefix", "cdata_key", NULL};

  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss",
                                   const_cast<char**>(kwlist), &attr_prefix,
                                   &cdata_key))
    return -1;

  // Build every new object first so a failure leaves the handler untouched.
  PyObject* new_data = PyUnicode_New(0, 127);  // empty string
  PyObject* new_prefix = PyUnicode_FromString(attr_prefix);
  PyObject* new_key = PyUnicode_FromString(cdata_key);
  if (new_data == NULL || new_prefix == NULL || new_key == NULL) {
    Py_XDECREF(new_data);
    Py_XDECREF(new_prefix);
    Py_XDECREF(new_key);
    return -1;
  }

  // __init__ may be called more than once; release what a previous call left.
  PyObject* old_item = self->item;
  PyObject* old_data = self->data;
  PyObject* old_prefix = self->attr_prefix;
  PyObject* old_key = self->cdata_key;

  self->item = Py_None;  // stored without INCREF, i.e. borrowed
  self->data = new_data;
  self->attr_prefix = new_prefix;
  self->cdata_key = new_key;

  // A non-None item is a dict this handler created and owns.
  if (old_item != NULL && old_item != Py_None) {
    Py_DECREF(old_item);
  }
  Py_XDECREF(old_data);
  Py_XDECREF(old_prefix);
  Py_XDECREF(old_key);
  return 0;
}
|
|
53
|
+
|
|
54
|
+
// CharacterDataHandler callback: append `data_obj` to the text buffer of the
// element currently being parsed.
//
// Returns None, or NULL with an exception set when the append fails (for
// example when `data_obj` is not a str).
static PyObject* characters(PyDictHandler* self, PyObject* data_obj) {
  PyUnicode_Append(&self->data, data_obj);
  if (self->data == NULL) {
    // PyUnicode_Append clears the left operand on failure. Put an empty
    // buffer back so later callbacks never see a NULL `data`, then report
    // the pending exception instead of returning None on top of it.
    self->data = PyUnicode_New(0, 127);
    return NULL;
  }
  Py_RETURN_NONE;
}
|
|
58
|
+
|
|
59
|
+
// StartElementHandler callback, called with (name, attrs).
//
// Saves the parent's item and text buffer on the stacks and starts a fresh
// state for the new element: an empty text buffer, and either Py_None (no
// attributes) or a dict holding every attribute under `attr_prefix + name`.
//
// Returns None, or NULL with an exception set. All fallible work happens
// before the handler state is modified, so a failure leaves it unchanged.
static PyObject* startElement(PyDictHandler* self, PyObject* args) {
  const char* name;  // parsed only to validate the argument; not used here
  PyObject* attrs;
  if (!PyArg_ParseTuple(args, "sO", &name, &attrs)) {
    return NULL;
  }

  PyObject* new_item = Py_None;  // borrowed, matching how `item` stores None
  if (PyDict_Check(attrs) && PyDict_Size(attrs) > 0) {
    new_item = PyDict_New();
    if (new_item == NULL) {
      return NULL;
    }

    PyObject *key, *value;
    Py_ssize_t pos = 0;
    while (PyDict_Next(attrs, &pos, &key, &value)) {
      PyObject* prefixed_key = PyUnicode_Concat(self->attr_prefix, key);
      if (prefixed_key == NULL) {
        Py_DECREF(new_item);
        return NULL;
      }
      const int status = PyDict_SetItem(new_item, prefixed_key, value);
      // The dict holds its own reference to the key; drop ours.
      Py_DECREF(prefixed_key);
      if (status < 0) {
        Py_DECREF(new_item);
        return NULL;
      }
    }
  }

  PyObject* new_data = PyUnicode_New(0, 127);  // reset data buffer
  if (new_data == NULL) {
    if (new_item != Py_None) {
      Py_DECREF(new_item);
    }
    return NULL;
  }

  // Ownership of the parent's item and buffer moves onto the stacks.
  self->item_stack.push_back(self->item);
  self->data_stack.push_back(self->data);
  self->item = new_item;
  self->data = new_data;
  Py_RETURN_NONE;
}
|
|
87
|
+
|
|
88
|
+
// Store `value` under `key` in the dict `target`, creating the dict first
// when `target` is still Py_None. A key seen for the second time is promoted
// to a two-element list; later repeats are appended to that list.
//
// `key` and `value` are borrowed: the dict and list take their own
// references. Returns `target` on success, or NULL with an exception set.
static PyObject* updateChildren(PyObject*& target, PyObject* key, PyObject* value) {
  if (target == Py_None) {
    PyObject* created = PyDict_New();
    if (created == NULL) {
      return NULL;
    }
    target = created;  // the caller's slot now owns the new dict
  }

  PyObject* existing = PyDict_GetItemWithError(target, key);  // borrowed
  if (existing == NULL) {
    if (PyErr_Occurred()) {
      return NULL;  // lookup failed, as opposed to "key not present"
    }
    if (PyDict_SetItem(target, key, value) < 0) {
      return NULL;
    }
  } else if (PyList_Check(existing)) {
    if (PyList_Append(existing, value) < 0) {
      return NULL;
    }
  } else {
    PyObject* siblings = PyList_New(2);
    if (siblings == NULL) {
      return NULL;
    }
    // PyList_SetItem steals a reference and both objects are only borrowed
    // here, so give the list references of its own first.
    Py_INCREF(existing);
    PyList_SetItem(siblings, 0, existing);
    Py_INCREF(value);
    PyList_SetItem(siblings, 1, value);
    const int status = PyDict_SetItem(target, key, siblings);
    Py_DECREF(siblings);  // the dict keeps the list alive
    if (status < 0) {
      return NULL;
    }
  }
  return target;
}

// EndElementHandler callback, called with the element name.
//
// Finishes the current element and files it under its name in the parent:
//   - element with attributes or children -> its dict, plus the stripped
//     text under `cdata_key` when there is any;
//   - element with text only              -> the stripped text;
//   - empty element                       -> None.
// Does nothing when no element is open. Returns None, or NULL with an
// exception set.
static PyObject* endElement(PyDictHandler* self, PyObject* name_obj) {
  if (self->data_stack.empty()) {
    Py_RETURN_NONE;
  }

  PyObject* text = strip(self->data);  // new reference
  if (text == NULL) {
    return NULL;
  }
  const bool has_text = PyUnicode_GetLength(text) > 0;

  PyObject* child = self->item;       // owned dict, or borrowed Py_None
  PyObject* child_data = self->data;  // owned text buffer of this element

  // Restore the parent's state; the stacks hand their ownership back.
  self->item = self->item_stack.back();
  self->data = self->data_stack.back();
  self->item_stack.pop_back();
  self->data_stack.pop_back();
  Py_DECREF(child_data);

  PyObject* value = Py_None;
  bool ok = true;
  if (child != Py_None) {
    if (has_text) {
      ok = PyDict_SetItem(child, self->cdata_key, text) == 0;
    }
    value = child;
  } else if (has_text) {
    value = text;
  }

  if (ok) {
    ok = updateChildren(self->item, name_obj, value) != NULL;
  }

  // The parent now holds its own references; release ours. A child that is
  // the very same object as the restored parent is skipped, because
  // self->item still relies on that reference.
  Py_DECREF(text);
  if (child != Py_None && child != self->item) {
    Py_DECREF(child);
  }

  if (!ok) {
    return NULL;
  }
  Py_RETURN_NONE;
}
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
static PyMethodDef PyDictHandler_methods[] = {
|
|
138
|
+
{"characters", (PyCFunction)characters, METH_O, "Handle character data"},
|
|
139
|
+
{"startElement", (PyCFunction)startElement, METH_VARARGS, "Handle start of an element"},
|
|
140
|
+
{"endElement", (PyCFunction)endElement, METH_O, "Handle end of an element"},
|
|
141
|
+
{NULL, NULL, 0, NULL}
|
|
142
|
+
};
|
|
143
|
+
|
|
144
|
+
// Getter for the read-only `item` attribute: returns a new reference to the
// dict built so far. Yields None when nothing has been stored yet, including
// the case where __init__ never ran and `item` is still NULL.
static PyObject* PyDictHandler_get_item(PyDictHandler *self, void *closure)
{
    PyObject* item = (self->item != NULL) ? self->item : Py_None;
    Py_INCREF(item);
    return item;
}
|
|
149
|
+
|
|
150
|
+
static PyGetSetDef PyDictHandler_getset[] = {
|
|
151
|
+
{
|
|
152
|
+
"item", /* name */
|
|
153
|
+
(getter)PyDictHandler_get_item, /* get */
|
|
154
|
+
NULL, /* set */
|
|
155
|
+
NULL, /* doc */
|
|
156
|
+
NULL /* closure */
|
|
157
|
+
},
|
|
158
|
+
{NULL} /* Sentinel */
|
|
159
|
+
};
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
static PyTypeObject PyDictHandlerType = {
|
|
163
|
+
PyVarObject_HEAD_INIT(NULL, 0) "pyxmlhandler._PyDictHandler", // tp_name
|
|
164
|
+
sizeof(PyDictHandler), // tp_basicsize
|
|
165
|
+
0, // tp_itemsize
|
|
166
|
+
0, // tp_dealloc
|
|
167
|
+
0, // tp_vectorcall_offset
|
|
168
|
+
0, // tp_getattr
|
|
169
|
+
0, // tp_setattr
|
|
170
|
+
0, // tp_as_async
|
|
171
|
+
0, // tp_repr
|
|
172
|
+
0, // tp_as_number
|
|
173
|
+
0, // tp_as_sequence
|
|
174
|
+
0, // tp_as_mapping
|
|
175
|
+
0, // tp_hash
|
|
176
|
+
0, // tp_call
|
|
177
|
+
0, // tp_str
|
|
178
|
+
0, // tp_getattro
|
|
179
|
+
0, // tp_setattro
|
|
180
|
+
0, // tp_as_buffer
|
|
181
|
+
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, // tp_flags
|
|
182
|
+
"Handler that converts XML to Python dict", // tp_doc
|
|
183
|
+
0, // tp_traverse
|
|
184
|
+
0, // tp_clear
|
|
185
|
+
0, // tp_richcompare
|
|
186
|
+
0, // tp_weaklistoffset
|
|
187
|
+
0, // tp_iter
|
|
188
|
+
0, // tp_iternext
|
|
189
|
+
PyDictHandler_methods, // tp_methods
|
|
190
|
+
0, // tp_members
|
|
191
|
+
PyDictHandler_getset, // tp_getset
|
|
192
|
+
0, // tp_base
|
|
193
|
+
0, // tp_dict
|
|
194
|
+
0, // tp_descr_get
|
|
195
|
+
0, // tp_descr_set
|
|
196
|
+
0, // tp_dictoffset
|
|
197
|
+
(initproc)PyDictHandler_init, // tp_init
|
|
198
|
+
0, // tp_alloc
|
|
199
|
+
PyDictHandler_new, // tp_new
|
|
200
|
+
};
|
|
201
|
+
|
|
202
|
+
static PyModuleDef pyxmlhandlermodule = {
|
|
203
|
+
PyModuleDef_HEAD_INIT,
|
|
204
|
+
"pyxmlhandler",
|
|
205
|
+
"Module that provides XML to Python dict parsing",
|
|
206
|
+
-1,
|
|
207
|
+
NULL, NULL, NULL, NULL, NULL
|
|
208
|
+
};
|
|
209
|
+
|
|
210
|
+
// Module initialiser: finalise the handler type, create the `pyxmlhandler`
// module and expose the type as `_PyDictHandler`.
// Returns the new module, or NULL with an exception set.
PyMODINIT_FUNC PyInit_pyxmlhandler(void) {
  if (PyType_Ready(&PyDictHandlerType) < 0)
    return NULL;

  PyObject* m = PyModule_Create(&pyxmlhandlermodule);
  if (m == NULL)
    return NULL;

  Py_INCREF(&PyDictHandlerType);
  if (PyModule_AddObject(m, "_PyDictHandler",
                         (PyObject*)&PyDictHandlerType) < 0) {
    // PyModule_AddObject steals the reference only on success, so both the
    // type reference and the half-built module are released here.
    Py_DECREF(&PyDictHandlerType);
    Py_DECREF(m);
    return NULL;
  }
  return m;
}
|
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
import pytest
|
|
2
|
-
from xmlpydict import parse
|
|
3
2
|
import json
|
|
3
|
+
from xmlpydict import parse
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def test_simple():
|
|
7
|
-
assert parse("") == {}
|
|
8
|
-
assert parse("<p
|
|
9
|
-
assert parse("<p></p>") == {"p": {}}
|
|
7
|
+
assert parse("<p/>") == {"p": None}
|
|
8
|
+
assert parse("<p></p>") == {"p": None}
|
|
10
9
|
assert parse('<p width="10"></p>') == {"p": {"@width": "10"}}
|
|
10
|
+
assert parse("<p>Hello</p>") == {"p": "Hello"}
|
|
11
|
+
|
|
11
12
|
assert parse('<p width="10">Hello World</p>') == {
|
|
12
13
|
"p": {"@width": "10", "#text": "Hello World"}
|
|
13
14
|
}
|
|
@@ -21,7 +22,18 @@ def test_simple():
|
|
|
21
22
|
"p": {"@width": "10", "@height": "20"}
|
|
22
23
|
}
|
|
23
24
|
assert parse("<p>Hey <b>bold</b>There</p>") == {
|
|
24
|
-
"p": {"#text": "
|
|
25
|
+
"p": {"#text": "Hey There", "b": "bold"}
|
|
26
|
+
}
|
|
27
|
+
assert parse("<p>Hey <b>bold</b>There <b>bold</b>Buddy </p>") == {
|
|
28
|
+
"p": {"#text": "Hey There Buddy", "b": ["bold", "bold"]}
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
assert parse("<p>Hey <b/>There Buddy</p>") == {
|
|
32
|
+
"p": {"#text": "Hey There Buddy", "b": None}
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
assert parse("<p>Hey <b/>There Buddy <b/> </p>") == {
|
|
36
|
+
"p": {"#text": "Hey There Buddy", "b": [None, None]}
|
|
25
37
|
}
|
|
26
38
|
|
|
27
39
|
assert (
|
|
@@ -68,19 +80,16 @@ def test_simple():
|
|
|
68
80
|
|
|
69
81
|
def test_cdata():
|
|
70
82
|
assert parse("<content><![CDATA[<p>This is a paragraph</p>]]></content>") == {
|
|
71
|
-
"content":
|
|
83
|
+
"content": "<p>This is a paragraph</p>"
|
|
72
84
|
}
|
|
85
|
+
assert parse(
|
|
86
|
+
"<special_chars><![CDATA[$ ^ * % & <> () + - + ` ~]]></special_chars>"
|
|
87
|
+
) == {"special_chars": "$ ^ * % & <> () + - + ` ~"}
|
|
73
88
|
|
|
74
89
|
|
|
75
90
|
def test_nested():
|
|
76
|
-
assert parse("<book><p/></book> ") == {"book": {"p":
|
|
77
|
-
assert parse("<book><p></p></book>") == {"book": {"p":
|
|
78
|
-
assert parse("<book><p></p></book><card/>") == {"book": {"p": {}}, "card": {}}
|
|
79
|
-
assert parse("<pizza></pizza><book><p></p></book><card/>") == {
|
|
80
|
-
"pizza": {},
|
|
81
|
-
"book": {"p": {}},
|
|
82
|
-
"card": {},
|
|
83
|
-
}
|
|
91
|
+
assert parse("<book><p/></book> ") == {"book": {"p": None}}
|
|
92
|
+
assert parse("<book><p></p></book>") == {"book": {"p": None}}
|
|
84
93
|
|
|
85
94
|
|
|
86
95
|
def test_list():
|
|
@@ -95,12 +104,20 @@ def test_list():
|
|
|
95
104
|
|
|
96
105
|
|
|
97
106
|
def test_comment():
|
|
98
|
-
assert parse("
|
|
107
|
+
assert parse("<p/><!-- simple comment -->") == {"p": None}
|
|
99
108
|
comment = """<world>
|
|
100
109
|
<!-- $comment+++@python -->
|
|
101
110
|
<lake>Content</lake>
|
|
102
111
|
</world>"""
|
|
103
|
-
assert parse(comment) == {"world": {"lake":
|
|
112
|
+
assert parse(comment) == {"world": {"lake": "Content"}}
|
|
113
|
+
multiple_comments = """<book>
|
|
114
|
+
<!-- Comment 0 -->
|
|
115
|
+
<!-- Comment 1 -->
|
|
116
|
+
<lines>510</lines>
|
|
117
|
+
<!-- Comment 2 -->
|
|
118
|
+
<!-- -->
|
|
119
|
+
</book>"""
|
|
120
|
+
assert parse(multiple_comments) == {"book": {"lines": "510"}}
|
|
104
121
|
|
|
105
122
|
|
|
106
123
|
def test_files():
|
|
@@ -275,15 +292,17 @@ def test_files():
|
|
|
275
292
|
|
|
276
293
|
def test_exception():
|
|
277
294
|
xml_strings = [
|
|
278
|
-
"< p/>",
|
|
279
|
-
"<p>",
|
|
280
|
-
"<p/ >",
|
|
281
295
|
"<p height'10'/>",
|
|
282
296
|
"<p height='10'width='5'/>",
|
|
283
|
-
"<p width='5/>",
|
|
284
297
|
"<p width=5'/>",
|
|
285
|
-
"</p>",
|
|
286
298
|
"<pwidth='5'/>",
|
|
299
|
+
"<!---->",
|
|
300
|
+
"<a></p>",
|
|
301
|
+
"<></>",
|
|
302
|
+
"</>",
|
|
303
|
+
"<",
|
|
304
|
+
">",
|
|
305
|
+
"<nested></p></nested>",
|
|
287
306
|
]
|
|
288
307
|
for xml_str in xml_strings:
|
|
289
308
|
with pytest.raises(Exception):
|
|
@@ -291,8 +310,43 @@ def test_exception():
|
|
|
291
310
|
|
|
292
311
|
|
|
293
312
|
def test_prefix():
|
|
294
|
-
assert parse("<p></p>", attr_prefix="$") == {"p":
|
|
313
|
+
assert parse("<p></p>", attr_prefix="$") == {"p": None}
|
|
295
314
|
assert parse('<p width="10"></p>', attr_prefix="$") == {"p": {"$width": "10"}}
|
|
296
315
|
assert parse('<p width="10" height="5"></p>', attr_prefix="$") == {
|
|
297
316
|
"p": {"$width": "10", "$height": "5"}
|
|
298
317
|
}
|
|
318
|
+
assert parse('<p width="10" height="5"></p>', attr_prefix="$$$$$$$$$") == {
|
|
319
|
+
"p": {"$$$$$$$$$width": "10", "$$$$$$$$$height": "5"}
|
|
320
|
+
}
|
|
321
|
+
assert parse('<p width="10" height="5"></p>', attr_prefix="") == {
|
|
322
|
+
"p": {"width": "10", "height": "5"}
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def test_document():
|
|
327
|
+
s = """<?xml version="1.0" encoding="UTF-8"?><repository>
|
|
328
|
+
<project pypi="xmlpydict">
|
|
329
|
+
<title>XML document parser</title>
|
|
330
|
+
<author>Matthew Taylor</author>
|
|
331
|
+
</project>
|
|
332
|
+
<project pypi="blank">
|
|
333
|
+
<title>Test project</title>
|
|
334
|
+
<author>Matthew Taylor</author>
|
|
335
|
+
</project>
|
|
336
|
+
</repository>"""
|
|
337
|
+
assert parse(s) == {
|
|
338
|
+
"repository": {
|
|
339
|
+
"project": [
|
|
340
|
+
{
|
|
341
|
+
"@pypi": "xmlpydict",
|
|
342
|
+
"title": "XML document parser",
|
|
343
|
+
"author": "Matthew Taylor",
|
|
344
|
+
},
|
|
345
|
+
{
|
|
346
|
+
"@pypi": "blank",
|
|
347
|
+
"title": "Test project",
|
|
348
|
+
"author": "Matthew Taylor",
|
|
349
|
+
},
|
|
350
|
+
]
|
|
351
|
+
}
|
|
352
|
+
}
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
from pyxmlhandler import _PyDictHandler
|
|
2
|
+
from xml.parsers import expat
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def parse(xml_content, attr_prefix: str = "@", cdata_key: str = "#text") -> dict:
    """
    Convert an in-memory XML document into a python dictionary.

    Args:
        xml_content: The XML document, handed to expat as one final chunk.
        attr_prefix: Prefix put in front of attribute names in the result.
        cdata_key: Key under which an element's character data is stored.

    Returns:
        The dictionary built from the XML content.
    """
    builder = _PyDictHandler(attr_prefix=attr_prefix, cdata_key=cdata_key)
    xml_parser = expat.ParserCreate()
    # Route the three expat events used here to the native dict builder.
    callbacks = {
        "CharacterDataHandler": builder.characters,
        "StartElementHandler": builder.startElement,
        "EndElementHandler": builder.endElement,
    }
    for hook_name, callback in callbacks.items():
        setattr(xml_parser, hook_name, callback)
    xml_parser.Parse(xml_content, True)
    return builder.item
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def parse_file(file_path, attr_prefix: str = "@", cdata_key: str = "#text") -> dict:
    """
    Convert the XML document stored in a file into a python dictionary.

    Args:
        file_path: Path of the XML file; it is opened in binary mode.
        attr_prefix: Prefix put in front of attribute names in the result.
        cdata_key: Key under which an element's character data is stored.

    Returns:
        The dictionary built from the file's XML content.
    """
    builder = _PyDictHandler(attr_prefix=attr_prefix, cdata_key=cdata_key)
    xml_parser = expat.ParserCreate()
    # Route the three expat events used here to the native dict builder.
    callbacks = {
        "CharacterDataHandler": builder.characters,
        "StartElementHandler": builder.startElement,
        "EndElementHandler": builder.endElement,
    }
    for hook_name, callback in callbacks.items():
        setattr(xml_parser, hook_name, callback)
    with open(file_path, "rb") as stream:
        xml_parser.ParseFile(stream)
    return builder.item
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def iter_xml_documents(file_path, chunk_size=64 * 1024):
    """
    Yield, as bytes, each XML document stored back-to-back in one file.

    A new document starts wherever the ``<?xml`` declaration token occurs
    after the first byte of the pending data. Pieces that contain only
    whitespace (for example blank lines before the first declaration) are
    skipped instead of being yielded as empty documents.

    Args:
        file_path: Path of the file to split; it is read in binary mode.
        chunk_size: Number of bytes read from the file per step.

    Yields:
        The raw bytes of each document, in file order.
    """
    start_token = b"<?xml"
    # A token cut in half by a read boundary starts at most this many bytes
    # before the end of the data that has already been searched.
    overlap = len(start_token) - 1
    buffer = b""
    # Index 0 is skipped so the declaration that opens the pending document
    # is not mistaken for the start of the next one.
    search_from = 1
    with open(file_path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            buffer += chunk
            while True:
                start_index = buffer.find(start_token, search_from)
                if start_index == -1:
                    # Resume just before the end next time rather than
                    # rescanning the whole buffer after every read.
                    search_from = max(1, len(buffer) - overlap)
                    break
                document = buffer[:start_index]
                buffer = buffer[start_index:]
                search_from = 1
                if document.strip():
                    yield document
    if buffer.strip():
        yield buffer
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def parse_xml_collections(file_path, attr_prefix: str = "@", cdata_key: str = "#text"):
    """
    Lazily parse every XML document stored back-to-back in one file.

    Args:
        file_path: Path of the file holding the concatenated XML documents.
        attr_prefix: Prefix put in front of attribute names in each result.
        cdata_key: Key under which an element's character data is stored.

    Yields:
        One dictionary per XML document, in file order.
    """
    for xml_content in iter_xml_documents(file_path):
        # Pass the raw bytes through, as parse_file does, so the parser
        # applies the encoding named in each document's own XML declaration
        # instead of every document being forced through UTF-8 first.
        yield parse(
            xml_content,
            attr_prefix=attr_prefix,
            cdata_key=cdata_key,
        )
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: xmlpydict
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.12
|
|
4
4
|
Summary: xml to dictionary tool for python
|
|
5
5
|
Author-email: Matthew Taylor <matthew.taylor.andre@gmail.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/MatthewAndreTaylor/xml-to-pydict
|
|
@@ -10,29 +10,30 @@ Classifier: Intended Audience :: Developers
|
|
|
10
10
|
Classifier: License :: OSI Approved :: MIT License
|
|
11
11
|
Classifier: Programming Language :: Python :: 3
|
|
12
12
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.7
|
|
14
13
|
Classifier: Programming Language :: Python :: 3.8
|
|
15
14
|
Classifier: Programming Language :: Python :: 3.9
|
|
16
15
|
Classifier: Programming Language :: Python :: 3.10
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.11
|
|
18
17
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
19
18
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
-
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup :: XML
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
21
|
Description-Content-Type: text/markdown
|
|
22
22
|
License-File: LICENSE
|
|
23
23
|
Provides-Extra: tests
|
|
24
24
|
Requires-Dist: pytest; extra == "tests"
|
|
25
|
-
Requires-Dist:
|
|
25
|
+
Requires-Dist: requests; extra == "tests"
|
|
26
|
+
Dynamic: license-file
|
|
26
27
|
|
|
27
28
|
# xmlpydict 📑
|
|
28
29
|
|
|
29
30
|
[](https://github.com/MatthewAndreTaylor/xml-to-pydict/actions/workflows/tests.yml)
|
|
30
|
-
[](https://github.com/MatthewAndreTaylor/xml-to-pydict)
|
|
31
32
|
[](https://pypi.org/project/xmlpydict/)
|
|
32
33
|
|
|
33
34
|
## Requirements
|
|
34
35
|
|
|
35
|
-
- `python 3.
|
|
36
|
+
- `python 3.8+`
|
|
36
37
|
|
|
37
38
|
## Installation
|
|
38
39
|
|
|
@@ -54,13 +55,11 @@ pip install xmlpydict
|
|
|
54
55
|
|
|
55
56
|
## Goals
|
|
56
57
|
|
|
57
|
-
Create a consistent parsing strategy between
|
|
58
|
-
xmlpydict takes a more laid pack approack to enforcing the syntax of xml.
|
|
58
|
+
Create a consistent parsing strategy between XML and Python dictionaries using the specification found [here](https://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html). `xmlpydict` focuses on speed; see the benchmarks below.
|
|
59
59
|
|
|
60
|
-
|
|
60
|
+
<img width="256" alt="small_xml_document" src="https://github.com/user-attachments/assets/0248a408-6bb6-4790-bd0f-f90537e2f21a" />
|
|
61
|
+
<img width="256" alt="large_xml_document" src="https://github.com/user-attachments/assets/539a2a69-f475-46a5-bffc-1e8805a5a5e7" />
|
|
61
62
|
|
|
62
|
-
xmlpydict allows for multiple root elements.
|
|
63
|
-
The root object is treated as the python object.
|
|
64
63
|
|
|
65
64
|
### xmlpydict supports the following
|
|
66
65
|
|
|
@@ -72,19 +71,15 @@ The root object is treated as the python object.
|
|
|
72
71
|
|
|
73
72
|
[Characters](https://www.w3.org/TR/xml/#charsets): Like CDATA, character data is stored as {'#text': Char}; however, leading and trailing whitespace is stripped from this text.
|
|
74
73
|
|
|
75
|
-
### dict.get(key[, default]) will not cause exceptions
|
|
76
|
-
|
|
77
74
|
```py
|
|
78
75
|
# Empty tags are containers
|
|
79
76
|
>>> from xmlpydict import parse
|
|
80
77
|
>>> parse("<a></a>")
|
|
81
|
-
{'a':
|
|
78
|
+
{'a': None}
|
|
82
79
|
>>> parse("<a/>")
|
|
83
|
-
{'a':
|
|
80
|
+
{'a': None}
|
|
84
81
|
>>> parse("<a/>").get('href')
|
|
85
82
|
None
|
|
86
|
-
>>> parse("")
|
|
87
|
-
{}
|
|
88
83
|
```
|
|
89
84
|
|
|
90
85
|
### Attribute prefixing
|
|
@@ -103,7 +98,7 @@ None
|
|
|
103
98
|
# Grammar and structure of the xml_content is checked while parsing
|
|
104
99
|
>>> from xmlpydict import parse
|
|
105
100
|
>>> parse("<a></ a>")
|
|
106
|
-
|
|
101
|
+
xml.parsers.expat.ExpatError: not well-formed (invalid token): line 1, column 5
|
|
107
102
|
```
|
|
108
103
|
|
|
109
104
|
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
MANIFEST.in
|
|
3
|
+
README.md
|
|
4
|
+
pyproject.toml
|
|
5
|
+
setup.py
|
|
6
|
+
src/xmlparse.cpp
|
|
7
|
+
tests/test_parse.py
|
|
8
|
+
xmlpydict/__init__.py
|
|
9
|
+
xmlpydict.egg-info/PKG-INFO
|
|
10
|
+
xmlpydict.egg-info/SOURCES.txt
|
|
11
|
+
xmlpydict.egg-info/dependency_links.txt
|
|
12
|
+
xmlpydict.egg-info/requires.txt
|
|
13
|
+
xmlpydict.egg-info/top_level.txt
|
xmlpydict-0.0.8/src/xmlparse.cpp
DELETED
|
@@ -1,413 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Copyright (c) 2023 Matthew Andre Taylor
|
|
3
|
-
*/
|
|
4
|
-
#include <Python.h>
|
|
5
|
-
#include <string>
|
|
6
|
-
#include <vector>
|
|
7
|
-
|
|
8
|
-
typedef enum {
|
|
9
|
-
PRIMITIVE,
|
|
10
|
-
CONTAINER_OPEN,
|
|
11
|
-
CONTAINER_CLOSE,
|
|
12
|
-
TEXT,
|
|
13
|
-
COMMENT
|
|
14
|
-
} NodeType;
|
|
15
|
-
|
|
16
|
-
typedef struct {
|
|
17
|
-
std::string key;
|
|
18
|
-
std::string value;
|
|
19
|
-
} Pair;
|
|
20
|
-
|
|
21
|
-
typedef struct {
|
|
22
|
-
NodeType type;
|
|
23
|
-
std::string elementName;
|
|
24
|
-
std::vector<Pair> attr;
|
|
25
|
-
} XMLNode;
|
|
26
|
-
|
|
27
|
-
size_t i;
|
|
28
|
-
|
|
29
|
-
static void parseContainerClose(XMLNode *node, const char *xmlContent) {
|
|
30
|
-
node->type = CONTAINER_CLOSE;
|
|
31
|
-
i++;
|
|
32
|
-
if (std::isalpha(xmlContent[i]) || xmlContent[i] == '_' ||
|
|
33
|
-
xmlContent[i] == ':') {
|
|
34
|
-
node->elementName.push_back(xmlContent[i]);
|
|
35
|
-
i++;
|
|
36
|
-
} else {
|
|
37
|
-
PyErr_Format(PyExc_Exception, "not well formed (violation at pos=%d)", i);
|
|
38
|
-
return;
|
|
39
|
-
}
|
|
40
|
-
|
|
41
|
-
while (xmlContent[i] != '\0' && xmlContent[i] != '>') {
|
|
42
|
-
if (std::isalnum(xmlContent[i]) || xmlContent[i] == '_' ||
|
|
43
|
-
xmlContent[i] == ':' || xmlContent[i] == '-' || xmlContent[i] == '.') {
|
|
44
|
-
node->elementName.push_back(xmlContent[i]);
|
|
45
|
-
} else if (std::isspace(xmlContent[i])) {
|
|
46
|
-
if (node->elementName.empty()) {
|
|
47
|
-
PyErr_Format(PyExc_Exception, "not well formed (violation at pos=%d)",
|
|
48
|
-
i);
|
|
49
|
-
return;
|
|
50
|
-
}
|
|
51
|
-
} else {
|
|
52
|
-
PyErr_Format(PyExc_Exception, "not well formed (violation at pos=%d)", i);
|
|
53
|
-
return;
|
|
54
|
-
}
|
|
55
|
-
i++;
|
|
56
|
-
}
|
|
57
|
-
i++;
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
static void parseContainerOpen(XMLNode *node, const char *xmlContent) {
|
|
61
|
-
node->type = CONTAINER_OPEN;
|
|
62
|
-
|
|
63
|
-
if (std::isalpha(xmlContent[i]) || xmlContent[i] == '_' ||
|
|
64
|
-
xmlContent[i] == ':') {
|
|
65
|
-
node->elementName.push_back(xmlContent[i]);
|
|
66
|
-
i++;
|
|
67
|
-
} else {
|
|
68
|
-
PyErr_Format(PyExc_Exception, "not well formed (violation at pos=%d)", i);
|
|
69
|
-
return;
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
bool hasAttr = false;
|
|
73
|
-
|
|
74
|
-
// Parse name
|
|
75
|
-
while (xmlContent[i] != '\0' && xmlContent[i] != '>') {
|
|
76
|
-
if (xmlContent[i] == '/' && xmlContent[i + 1] == '>') {
|
|
77
|
-
node->type = PRIMITIVE;
|
|
78
|
-
i += 2;
|
|
79
|
-
return;
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
if (std::isalnum(xmlContent[i]) || xmlContent[i] == '_' ||
|
|
83
|
-
xmlContent[i] == ':' || xmlContent[i] == '-' || xmlContent[i] == '.') {
|
|
84
|
-
node->elementName.push_back(xmlContent[i]);
|
|
85
|
-
i++;
|
|
86
|
-
} else if (std::isspace(xmlContent[i])) {
|
|
87
|
-
if (node->elementName.empty()) {
|
|
88
|
-
PyErr_Format(PyExc_Exception, "not well formed (violation at pos=%d)",
|
|
89
|
-
i);
|
|
90
|
-
return;
|
|
91
|
-
}
|
|
92
|
-
hasAttr = true;
|
|
93
|
-
break;
|
|
94
|
-
} else {
|
|
95
|
-
PyErr_Format(PyExc_Exception, "not well formed (violation at pos=%d)", i);
|
|
96
|
-
return;
|
|
97
|
-
}
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
// 0: space, 1: start, 2: name, 3: equals, 4: quote, 5: value
|
|
101
|
-
char state = 0;
|
|
102
|
-
|
|
103
|
-
if (hasAttr) {
|
|
104
|
-
std::string key;
|
|
105
|
-
std::string val;
|
|
106
|
-
char quoteType = 0;
|
|
107
|
-
|
|
108
|
-
while (xmlContent[i] != '\0' && xmlContent[i] != '>') {
|
|
109
|
-
switch (state) {
|
|
110
|
-
case 0:
|
|
111
|
-
if (xmlContent[i] == '/' && xmlContent[i + 1] == '>') {
|
|
112
|
-
node->type = PRIMITIVE;
|
|
113
|
-
i += 2;
|
|
114
|
-
return;
|
|
115
|
-
}
|
|
116
|
-
if (std::isspace(xmlContent[i])) {
|
|
117
|
-
i++;
|
|
118
|
-
state = 1;
|
|
119
|
-
} else {
|
|
120
|
-
PyErr_Format(PyExc_Exception, "not well formed (violation at pos=%d)",
|
|
121
|
-
i);
|
|
122
|
-
return;
|
|
123
|
-
}
|
|
124
|
-
break;
|
|
125
|
-
case 1:
|
|
126
|
-
if (xmlContent[i] == '/' && xmlContent[i + 1] == '>') {
|
|
127
|
-
node->type = PRIMITIVE;
|
|
128
|
-
i += 2;
|
|
129
|
-
return;
|
|
130
|
-
}
|
|
131
|
-
if (std::isspace(xmlContent[i])) {
|
|
132
|
-
i++;
|
|
133
|
-
} else if (std::isalpha(xmlContent[i]) || xmlContent[i] == '_' ||
|
|
134
|
-
xmlContent[i] == ':') {
|
|
135
|
-
state = 2;
|
|
136
|
-
key.push_back(xmlContent[i]);
|
|
137
|
-
i++;
|
|
138
|
-
} else {
|
|
139
|
-
PyErr_Format(PyExc_Exception, "not well formed (violation at pos=%d)",
|
|
140
|
-
i);
|
|
141
|
-
return;
|
|
142
|
-
}
|
|
143
|
-
break;
|
|
144
|
-
case 2:
|
|
145
|
-
if (xmlContent[i] == '=') {
|
|
146
|
-
state = 4;
|
|
147
|
-
} else if (std::isalnum(xmlContent[i]) || xmlContent[i] == '_' ||
|
|
148
|
-
xmlContent[i] == ':' || xmlContent[i] == '-' ||
|
|
149
|
-
xmlContent[i] == '.') {
|
|
150
|
-
key.push_back(xmlContent[i]);
|
|
151
|
-
} else if (std::isspace(xmlContent[i])) {
|
|
152
|
-
state = 3;
|
|
153
|
-
} else {
|
|
154
|
-
PyErr_Format(PyExc_Exception, "not well formed (violation at pos=%d)",
|
|
155
|
-
i);
|
|
156
|
-
return;
|
|
157
|
-
}
|
|
158
|
-
i++;
|
|
159
|
-
break;
|
|
160
|
-
case 3:
|
|
161
|
-
if (xmlContent[i] == '=') {
|
|
162
|
-
state = 4;
|
|
163
|
-
} else if (!std::isspace(xmlContent[i])) {
|
|
164
|
-
PyErr_Format(PyExc_Exception, "not well formed (violation at pos=%d)",
|
|
165
|
-
i);
|
|
166
|
-
return;
|
|
167
|
-
}
|
|
168
|
-
i++;
|
|
169
|
-
break;
|
|
170
|
-
case 4:
|
|
171
|
-
if (xmlContent[i] == '\'' || xmlContent[i] == '\"') {
|
|
172
|
-
state = 5;
|
|
173
|
-
quoteType = xmlContent[i];
|
|
174
|
-
} else if (!std::isspace(xmlContent[i])) {
|
|
175
|
-
PyErr_Format(PyExc_Exception, "not well formed (violation at pos=%d)",
|
|
176
|
-
i);
|
|
177
|
-
return;
|
|
178
|
-
}
|
|
179
|
-
i++;
|
|
180
|
-
break;
|
|
181
|
-
default:
|
|
182
|
-
if (xmlContent[i] == quoteType) {
|
|
183
|
-
state = 0;
|
|
184
|
-
node->attr.push_back({key, val});
|
|
185
|
-
key.clear();
|
|
186
|
-
val.clear();
|
|
187
|
-
} else {
|
|
188
|
-
val.push_back(xmlContent[i]);
|
|
189
|
-
}
|
|
190
|
-
i++;
|
|
191
|
-
break;
|
|
192
|
-
}
|
|
193
|
-
}
|
|
194
|
-
}
|
|
195
|
-
if (state > 1) {
|
|
196
|
-
PyErr_Format(PyExc_Exception, "not well formed (violation at pos=%d)", i);
|
|
197
|
-
return;
|
|
198
|
-
}
|
|
199
|
-
i++;
|
|
200
|
-
}
|
|
201
|
-
|
|
202
|
-
static void parseComment(XMLNode *node, const char *xmlContent) {
|
|
203
|
-
node->type = COMMENT;
|
|
204
|
-
i++;
|
|
205
|
-
if (xmlContent[i] != '-' || xmlContent[i + 1] != '-') {
|
|
206
|
-
PyErr_Format(PyExc_Exception, "not well formed (violation at pos=%d)", i);
|
|
207
|
-
return;
|
|
208
|
-
}
|
|
209
|
-
i += 2;
|
|
210
|
-
|
|
211
|
-
while (xmlContent[i] != '\0' || xmlContent[i + 1] != '\0') {
|
|
212
|
-
if (xmlContent[i] == '-' && xmlContent[i + 1] == '-' &&
|
|
213
|
-
xmlContent[i + 2] == '>') {
|
|
214
|
-
// Found the end of the comment
|
|
215
|
-
if (xmlContent[i - 1] == '-') {
|
|
216
|
-
PyErr_Format(PyExc_Exception, "not well formed (violation at pos=%d)",
|
|
217
|
-
i - 1);
|
|
218
|
-
return;
|
|
219
|
-
}
|
|
220
|
-
i += 3;
|
|
221
|
-
return;
|
|
222
|
-
}
|
|
223
|
-
i++;
|
|
224
|
-
}
|
|
225
|
-
PyErr_SetString(PyExc_Exception, "unclosed token");
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
static void parseCData(XMLNode *node, const char *xmlContent) {
|
|
229
|
-
node->type = TEXT;
|
|
230
|
-
i+=2;
|
|
231
|
-
std::string cdata = "CDATA[";
|
|
232
|
-
size_t j = 0;
|
|
233
|
-
while (xmlContent[i] != '\0') {
|
|
234
|
-
if (j >= cdata.size()) {
|
|
235
|
-
break;
|
|
236
|
-
}
|
|
237
|
-
if (cdata[j] != xmlContent[i]) {
|
|
238
|
-
PyErr_Format(PyExc_Exception, "not well formed (violation at pos=%d)", i);
|
|
239
|
-
return;
|
|
240
|
-
}
|
|
241
|
-
i++;
|
|
242
|
-
j++;
|
|
243
|
-
}
|
|
244
|
-
while (xmlContent[i] != '\0' || xmlContent[i + 1] != '\0') {
|
|
245
|
-
if (xmlContent[i] == ']' && xmlContent[i + 1] == ']' &&
|
|
246
|
-
xmlContent[i + 2] == '>') {
|
|
247
|
-
i += 3;
|
|
248
|
-
return;
|
|
249
|
-
}
|
|
250
|
-
node->elementName.push_back(xmlContent[i]);
|
|
251
|
-
i++;
|
|
252
|
-
}
|
|
253
|
-
PyErr_SetString(PyExc_Exception, "unclosed token");
|
|
254
|
-
}
|
|
255
|
-
|
|
256
|
-
static void parseText(XMLNode *node, const char *xmlContent) {
|
|
257
|
-
node->type = TEXT;
|
|
258
|
-
bool isSpace = false;
|
|
259
|
-
|
|
260
|
-
while (xmlContent[i] != '\0' && xmlContent[i] != '<') {
|
|
261
|
-
if (xmlContent[i] == '&') {
|
|
262
|
-
PyErr_Format(PyExc_Exception, "not well formed (violation at pos=%d)", i);
|
|
263
|
-
return;
|
|
264
|
-
}
|
|
265
|
-
if (isSpace || !std::isspace(xmlContent[i])) {
|
|
266
|
-
node->elementName.push_back(xmlContent[i]);
|
|
267
|
-
isSpace = true;
|
|
268
|
-
}
|
|
269
|
-
i++;
|
|
270
|
-
}
|
|
271
|
-
while (std::isspace(node->elementName.back())) {
|
|
272
|
-
node->elementName.pop_back();
|
|
273
|
-
}
|
|
274
|
-
}
|
|
275
|
-
|
|
276
|
-
static std::vector<XMLNode> splitNodes(const char *xmlContent) {
|
|
277
|
-
std::vector<XMLNode> nodes;
|
|
278
|
-
i = 0;
|
|
279
|
-
|
|
280
|
-
while (xmlContent[i] != '\0') {
|
|
281
|
-
XMLNode node;
|
|
282
|
-
if (xmlContent[i] == '<') {
|
|
283
|
-
i++;
|
|
284
|
-
if (xmlContent[i] == '/') {
|
|
285
|
-
parseContainerClose(&node, xmlContent);
|
|
286
|
-
} else if (xmlContent[i] == '!') {
|
|
287
|
-
if (xmlContent[i+1] == '[') {
|
|
288
|
-
parseCData(&node, xmlContent);
|
|
289
|
-
} else {
|
|
290
|
-
parseComment(&node, xmlContent);
|
|
291
|
-
}
|
|
292
|
-
} else {
|
|
293
|
-
parseContainerOpen(&node, xmlContent);
|
|
294
|
-
}
|
|
295
|
-
} else {
|
|
296
|
-
parseText(&node, xmlContent);
|
|
297
|
-
}
|
|
298
|
-
if (!node.elementName.empty()) {
|
|
299
|
-
nodes.push_back(node);
|
|
300
|
-
}
|
|
301
|
-
}
|
|
302
|
-
|
|
303
|
-
return nodes;
|
|
304
|
-
}
|
|
305
|
-
|
|
306
|
-
static PyObject *createDict(const std::vector<Pair> &attributes, char* attributePrefix) {
|
|
307
|
-
PyObject *dict = PyDict_New();
|
|
308
|
-
for (const Pair &attr : attributes) {
|
|
309
|
-
const std::string &key = attributePrefix + attr.key;
|
|
310
|
-
PyObject *val = PyUnicode_FromString(attr.value.c_str());
|
|
311
|
-
PyDict_SetItemString(dict, key.c_str(), val);
|
|
312
|
-
}
|
|
313
|
-
|
|
314
|
-
return dict;
|
|
315
|
-
}
|
|
316
|
-
|
|
317
|
-
PyDoc_STRVAR(xml_parse_doc, "parse(xml_content: str, attr_prefix=\"@\") -> dict:\n"
|
|
318
|
-
"...\n\n"
|
|
319
|
-
"Parse XML content into a dictionary.\n\n"
|
|
320
|
-
"Args:\n\t"
|
|
321
|
-
"xml_content (str): xml document to be parsed.\n"
|
|
322
|
-
"Returns:\n\t"
|
|
323
|
-
"dict: Dictionary of the xml dom.\n");
|
|
324
|
-
static PyObject *xml_parse(PyObject *self, PyObject *args, PyObject *kwargs) {
|
|
325
|
-
const char *xmlContent;
|
|
326
|
-
char* attributePrefix = "@";
|
|
327
|
-
|
|
328
|
-
static char *kwlist[] = {"xml_content", "attr_prefix", NULL};
|
|
329
|
-
|
|
330
|
-
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|s", kwlist, &xmlContent, &attributePrefix)) {
|
|
331
|
-
return NULL;
|
|
332
|
-
}
|
|
333
|
-
|
|
334
|
-
std::vector<XMLNode> nodes = splitNodes(xmlContent);
|
|
335
|
-
if (PyErr_Occurred() != NULL) {
|
|
336
|
-
return NULL;
|
|
337
|
-
}
|
|
338
|
-
PyObject *currDict = PyDict_New();
|
|
339
|
-
std::vector<std::string> containerStackNames;
|
|
340
|
-
std::vector<PyObject *> containerStack;
|
|
341
|
-
containerStack.push_back(currDict);
|
|
342
|
-
containerStackNames.push_back("");
|
|
343
|
-
|
|
344
|
-
bool isList = false;
|
|
345
|
-
|
|
346
|
-
for (const XMLNode &node : nodes) {
|
|
347
|
-
PyObject *childKey = PyUnicode_FromString(node.elementName.c_str());
|
|
348
|
-
|
|
349
|
-
if (node.type == TEXT) {
|
|
350
|
-
PyObject *item = PyDict_GetItemString(currDict, "#text");
|
|
351
|
-
if (item != NULL) {
|
|
352
|
-
PyDict_SetItemString(currDict, "#text", PyUnicode_Concat(item, childKey));
|
|
353
|
-
} else {
|
|
354
|
-
PyDict_SetItemString(currDict, "#text", childKey);
|
|
355
|
-
}
|
|
356
|
-
} else if (node.type == CONTAINER_OPEN || node.type == PRIMITIVE) {
|
|
357
|
-
PyObject *d = createDict(node.attr, attributePrefix);
|
|
358
|
-
|
|
359
|
-
PyObject *item = PyDict_GetItem(currDict, childKey);
|
|
360
|
-
if (item != NULL) {
|
|
361
|
-
// Check if it is a List or dict
|
|
362
|
-
if (isList && PyList_Check(item)) {
|
|
363
|
-
PyList_Append(item, d);
|
|
364
|
-
} else {
|
|
365
|
-
PyObject *children = PyList_New(2);
|
|
366
|
-
PyList_SetItem(children, 0, item);
|
|
367
|
-
PyList_SetItem(children, 1, d);
|
|
368
|
-
PyDict_SetItem(currDict, childKey, children);
|
|
369
|
-
isList = true;
|
|
370
|
-
}
|
|
371
|
-
} else {
|
|
372
|
-
PyDict_SetItem(currDict, childKey, d);
|
|
373
|
-
isList = false;
|
|
374
|
-
}
|
|
375
|
-
|
|
376
|
-
if (node.type == CONTAINER_OPEN) {
|
|
377
|
-
currDict = d;
|
|
378
|
-
containerStack.push_back(d);
|
|
379
|
-
containerStackNames.push_back(node.elementName);
|
|
380
|
-
}
|
|
381
|
-
} else if (node.type == CONTAINER_CLOSE) {
|
|
382
|
-
if (containerStackNames.back() != node.elementName) {
|
|
383
|
-
PyErr_Format(PyExc_Exception,
|
|
384
|
-
"tag mismatch ('%U' does not match the last start tag)",
|
|
385
|
-
childKey);
|
|
386
|
-
}
|
|
387
|
-
containerStackNames.pop_back();
|
|
388
|
-
containerStack.pop_back();
|
|
389
|
-
currDict = containerStack.back();
|
|
390
|
-
}
|
|
391
|
-
|
|
392
|
-
Py_DECREF(childKey);
|
|
393
|
-
}
|
|
394
|
-
|
|
395
|
-
if (containerStack.size() > 1) {
|
|
396
|
-
PyErr_Format(PyExc_Exception, "not well formed (%d unclosed tags)",
|
|
397
|
-
containerStack.size() - 1);
|
|
398
|
-
return NULL;
|
|
399
|
-
}
|
|
400
|
-
|
|
401
|
-
PyObject *result = containerStack.front();
|
|
402
|
-
Py_INCREF(result);
|
|
403
|
-
return result;
|
|
404
|
-
}
|
|
405
|
-
|
|
406
|
-
static PyMethodDef XMLParserMethods[] = {
|
|
407
|
-
{"parse", (PyCFunction)xml_parse, METH_VARARGS | METH_KEYWORDS, xml_parse_doc},
|
|
408
|
-
{NULL, NULL, 0, NULL}};
|
|
409
|
-
|
|
410
|
-
static struct PyModuleDef xmlparsermodule = {PyModuleDef_HEAD_INIT, "xmlpydict",
|
|
411
|
-
NULL, -1, XMLParserMethods};
|
|
412
|
-
|
|
413
|
-
PyMODINIT_FUNC PyInit_xmlpydict() { return PyModule_Create(&xmlparsermodule); }
|
xmlpydict-0.0.8/src/xmlparse.py
DELETED
|
@@ -1,68 +0,0 @@
|
|
|
1
|
-
def parse(xml_content: str) -> dict:
|
|
2
|
-
i = 0
|
|
3
|
-
key = "@"
|
|
4
|
-
val = ""
|
|
5
|
-
xml_content += " "
|
|
6
|
-
|
|
7
|
-
curr_dict = {}
|
|
8
|
-
container_stack = [curr_dict]
|
|
9
|
-
|
|
10
|
-
while i < len(xml_content):
|
|
11
|
-
element_name = ""
|
|
12
|
-
|
|
13
|
-
if xml_content[i] == "<":
|
|
14
|
-
if xml_content[i + 1] == "/":
|
|
15
|
-
container_stack.pop()
|
|
16
|
-
curr_dict = container_stack[-1]
|
|
17
|
-
i = xml_content.find(">", i + 1)
|
|
18
|
-
elif xml_content[i + 1] == "!":
|
|
19
|
-
i = xml_content.find(">", i + 1)
|
|
20
|
-
else:
|
|
21
|
-
i += 1
|
|
22
|
-
has_attr = False
|
|
23
|
-
in_quotes = False
|
|
24
|
-
is_container = True
|
|
25
|
-
d = {}
|
|
26
|
-
while i < len(xml_content) and xml_content[i] != ">":
|
|
27
|
-
is_space = xml_content[i].isspace()
|
|
28
|
-
if xml_content[i] == "/" and xml_content[i + 1] == ">":
|
|
29
|
-
is_container = False
|
|
30
|
-
elif not has_attr and is_space:
|
|
31
|
-
has_attr = True
|
|
32
|
-
else:
|
|
33
|
-
if has_attr:
|
|
34
|
-
if xml_content[i] == "'" or xml_content[i] == '"':
|
|
35
|
-
in_quotes = not in_quotes
|
|
36
|
-
if not in_quotes and key != "" and val != "":
|
|
37
|
-
d[key] = val
|
|
38
|
-
key = "@"
|
|
39
|
-
val = ""
|
|
40
|
-
elif in_quotes:
|
|
41
|
-
val += xml_content[i]
|
|
42
|
-
elif xml_content[i] != "=" and not is_space:
|
|
43
|
-
key += xml_content[i]
|
|
44
|
-
else:
|
|
45
|
-
element_name += xml_content[i]
|
|
46
|
-
i += 1
|
|
47
|
-
item = curr_dict.get(element_name)
|
|
48
|
-
if item is None:
|
|
49
|
-
curr_dict[element_name] = d
|
|
50
|
-
else:
|
|
51
|
-
if isinstance(item, list):
|
|
52
|
-
item.append(d)
|
|
53
|
-
else:
|
|
54
|
-
curr_dict[element_name] = [item, d]
|
|
55
|
-
if is_container:
|
|
56
|
-
curr_dict = d
|
|
57
|
-
container_stack.append(d)
|
|
58
|
-
i += 1
|
|
59
|
-
else:
|
|
60
|
-
j = xml_content.find("<", i + 1)
|
|
61
|
-
if j < 0:
|
|
62
|
-
return container_stack.pop()
|
|
63
|
-
element_name = xml_content[i:j].strip()
|
|
64
|
-
i = j
|
|
65
|
-
if len(element_name) > 0:
|
|
66
|
-
curr_dict["#text"] = element_name
|
|
67
|
-
|
|
68
|
-
return container_stack.pop()
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
LICENSE
|
|
2
|
-
MANIFEST.in
|
|
3
|
-
README.md
|
|
4
|
-
pyproject.toml
|
|
5
|
-
setup.py
|
|
6
|
-
src/xmlparse.cpp
|
|
7
|
-
src/xmlparse.py
|
|
8
|
-
src/xmlpydict.egg-info/PKG-INFO
|
|
9
|
-
src/xmlpydict.egg-info/SOURCES.txt
|
|
10
|
-
src/xmlpydict.egg-info/dependency_links.txt
|
|
11
|
-
src/xmlpydict.egg-info/requires.txt
|
|
12
|
-
src/xmlpydict.egg-info/top_level.txt
|
|
13
|
-
tests/test.py
|
|
14
|
-
tests/test_parse.py
|
xmlpydict-0.0.8/tests/test.py
DELETED
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
import xmlpydict
|
|
2
|
-
import xmltodict
|
|
3
|
-
import timeit
|
|
4
|
-
|
|
5
|
-
s = """<svg xmlns="http://www.w3.org/2000/svg" width="400" height="400">
|
|
6
|
-
<rect x="50" y="50" width="100" height="50" fill="blue" />
|
|
7
|
-
<circle cx="200" cy="100" r="50" fill="red" />
|
|
8
|
-
<ellipse cx="350" cy="75" rx="50" ry="25" fill="green" />
|
|
9
|
-
<line x1="50" y1="200" x2="150" y2="300" stroke="orange" />
|
|
10
|
-
<polyline points="200,200 250,250 300,200 350,250" fill="none" stroke="purple" />
|
|
11
|
-
<polygon points="350,200 400,250 400,150" fill="yellow" />
|
|
12
|
-
<path d="M50,350 L100,350 Q125,375 150,350 T200,350" fill="none" stroke="black"/>
|
|
13
|
-
|
|
14
|
-
<rect x="10" y="10" height="100" width="100"
|
|
15
|
-
style="stroke:#ff0000; fill: #0000ff"/>
|
|
16
|
-
<path d="M50,350 L100,350 Q125,375 150,350 T200,350" fill="none" stroke="black"/><polygon points="350,200 400,250 400,150" fill="yellow" />
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
<circle cx="200" cy="100" r="50" fill="red"></circle>
|
|
20
|
-
|
|
21
|
-
<polygon points="350,200 400,250 400,150" fill="yellow" />
|
|
22
|
-
</svg>"""
|
|
23
|
-
print(timeit.timeit(lambda: xmlpydict.parse(s), number=100))
|
|
24
|
-
print(timeit.timeit(lambda: xmltodict.parse(s), number=100))
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|