xmlpydict 0.0.7__tar.gz → 0.0.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xmlpydict-0.0.11/PKG-INFO +111 -0
- xmlpydict-0.0.11/README.md +84 -0
- {xmlpydict-0.0.7 → xmlpydict-0.0.11}/pyproject.toml +3 -3
- {xmlpydict-0.0.7 → xmlpydict-0.0.11}/setup.py +2 -2
- xmlpydict-0.0.11/src/xmlparse.cpp +222 -0
- {xmlpydict-0.0.7 → xmlpydict-0.0.11}/tests/test_parse.py +88 -20
- xmlpydict-0.0.11/xmlpydict/__init__.py +45 -0
- xmlpydict-0.0.11/xmlpydict.egg-info/PKG-INFO +111 -0
- xmlpydict-0.0.11/xmlpydict.egg-info/SOURCES.txt +13 -0
- {xmlpydict-0.0.7/src → xmlpydict-0.0.11}/xmlpydict.egg-info/requires.txt +1 -1
- xmlpydict-0.0.11/xmlpydict.egg-info/top_level.txt +2 -0
- xmlpydict-0.0.7/PKG-INFO +0 -70
- xmlpydict-0.0.7/README.md +0 -44
- xmlpydict-0.0.7/src/xmlparse.cpp +0 -378
- xmlpydict-0.0.7/src/xmlparse.py +0 -68
- xmlpydict-0.0.7/src/xmlpydict.egg-info/PKG-INFO +0 -70
- xmlpydict-0.0.7/src/xmlpydict.egg-info/SOURCES.txt +0 -14
- xmlpydict-0.0.7/src/xmlpydict.egg-info/top_level.txt +0 -2
- xmlpydict-0.0.7/tests/test.py +0 -24
- {xmlpydict-0.0.7 → xmlpydict-0.0.11}/LICENSE +0 -0
- {xmlpydict-0.0.7 → xmlpydict-0.0.11}/MANIFEST.in +0 -0
- {xmlpydict-0.0.7 → xmlpydict-0.0.11}/setup.cfg +0 -0
- {xmlpydict-0.0.7/src → xmlpydict-0.0.11}/xmlpydict.egg-info/dependency_links.txt +0 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: xmlpydict
|
|
3
|
+
Version: 0.0.11
|
|
4
|
+
Summary: xml to dictionary tool for python
|
|
5
|
+
Author-email: Matthew Taylor <matthew.taylor.andre@gmail.com>
|
|
6
|
+
Project-URL: Homepage, https://github.com/MatthewAndreTaylor/xml-to-pydict
|
|
7
|
+
Keywords: xml,dictionary
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup :: XML
|
|
20
|
+
Requires-Python: >=3.7
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Provides-Extra: tests
|
|
24
|
+
Requires-Dist: pytest; extra == "tests"
|
|
25
|
+
Requires-Dist: requests; extra == "tests"
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# xmlpydict 📑
|
|
29
|
+
|
|
30
|
+
[](https://github.com/MatthewAndreTaylor/xml-to-pydict/actions/workflows/tests.yml)
|
|
31
|
+
[](https://github.com/MatthewAndreTaylor/xml-to-pydict)
|
|
32
|
+
[](https://pypi.org/project/xmlpydict/)
|
|
33
|
+
|
|
34
|
+
## Requirements
|
|
35
|
+
|
|
36
|
+
- `python 3.8+`
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
To install xmlpydict using pip:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install xmlpydict
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Quickstart
|
|
47
|
+
|
|
48
|
+
```py
|
|
49
|
+
>>> from xmlpydict import parse
|
|
50
|
+
>>> parse("<package><xmlpydict language='python'/></package>")
|
|
51
|
+
{'package': {'xmlpydict': {'@language': 'python'}}}
|
|
52
|
+
>>> parse("<person name='Matthew'>Hello!</person>")
|
|
53
|
+
{'person': {'@name': 'Matthew', '#text': 'Hello!'}}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Goals
|
|
57
|
+
|
|
58
|
+
Create a consistent parsing strategy between XML and Python dictionaries using the specification found [here](https://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html). `xmlpydict` focuses on speed; see the benchmarks below.
|
|
59
|
+
|
|
60
|
+
<img width="256" alt="small_xml_document" src="https://github.com/user-attachments/assets/0248a408-6bb6-4790-bd0f-f90537e2f21a" />
|
|
61
|
+
<img width="256" alt="large_xml_document" src="https://github.com/user-attachments/assets/539a2a69-f475-46a5-bffc-1e8805a5a5e7" />
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
### xmlpydict supports the following
|
|
65
|
+
|
|
66
|
+
[CDataSection](https://www.w3.org/TR/xml/#sec-cdata-sect): CDATA Sections are stored as {'#text': CData}.
|
|
67
|
+
|
|
68
|
+
[Comments](https://www.w3.org/TR/xml/#sec-comments): Comments are tokenized for correctness, but have no effect on what is returned.
|
|
69
|
+
|
|
70
|
+
[Element Tags](https://www.w3.org/TR/xml/#sec-starttags): Allows for duplicate attributes, however only the latest defined will be taken.
|
|
71
|
+
|
|
72
|
+
[Characters](https://www.w3.org/TR/xml/#charsets): Similar to CDATA, text is stored as {'#text': Char}; however, this text is stripped.
|
|
73
|
+
|
|
74
|
+
```py
|
|
75
|
+
# Empty tags are containers
|
|
76
|
+
>>> from xmlpydict import parse
|
|
77
|
+
>>> parse("<a></a>")
|
|
78
|
+
{'a': None}
|
|
79
|
+
>>> parse("<a/>")
|
|
80
|
+
{'a': None}
|
|
81
|
+
>>> parse("<a/>").get('href')
|
|
82
|
+
None
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Attribute prefixing
|
|
86
|
+
|
|
87
|
+
```py
|
|
88
|
+
# Change prefix from default "@" with keyword argument attr_prefix
|
|
89
|
+
>>> from xmlpydict import parse
|
|
90
|
+
>>> parse('<p width="10" height="5"></p>', attr_prefix="$")
|
|
91
|
+
{"p": {"$width": "10", "$height": "5"}}
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
### Exceptions
|
|
96
|
+
|
|
97
|
+
```py
|
|
98
|
+
# Grammar and structure of the xml_content is checked while parsing
|
|
99
|
+
>>> from xmlpydict import parse
|
|
100
|
+
>>> parse("<a></ a>")
|
|
101
|
+
xml.parsers.expat.ExpatError: not well-formed (invalid token): line 1, column 5
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
### Unsupported
|
|
106
|
+
|
|
107
|
+
Prolog / Enforcing Document Type Definition and Element Type Declarations
|
|
108
|
+
|
|
109
|
+
Entity Referencing
|
|
110
|
+
|
|
111
|
+
Namespaces
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# xmlpydict 📑
|
|
2
|
+
|
|
3
|
+
[](https://github.com/MatthewAndreTaylor/xml-to-pydict/actions/workflows/tests.yml)
|
|
4
|
+
[](https://github.com/MatthewAndreTaylor/xml-to-pydict)
|
|
5
|
+
[](https://pypi.org/project/xmlpydict/)
|
|
6
|
+
|
|
7
|
+
## Requirements
|
|
8
|
+
|
|
9
|
+
- `python 3.8+`
|
|
10
|
+
|
|
11
|
+
## Installation
|
|
12
|
+
|
|
13
|
+
To install xmlpydict using pip:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install xmlpydict
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Quickstart
|
|
20
|
+
|
|
21
|
+
```py
|
|
22
|
+
>>> from xmlpydict import parse
|
|
23
|
+
>>> parse("<package><xmlpydict language='python'/></package>")
|
|
24
|
+
{'package': {'xmlpydict': {'@language': 'python'}}}
|
|
25
|
+
>>> parse("<person name='Matthew'>Hello!</person>")
|
|
26
|
+
{'person': {'@name': 'Matthew', '#text': 'Hello!'}}
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Goals
|
|
30
|
+
|
|
31
|
+
Create a consistent parsing strategy between XML and Python dictionaries using the specification found [here](https://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html). `xmlpydict` focuses on speed; see the benchmarks below.
|
|
32
|
+
|
|
33
|
+
<img width="256" alt="small_xml_document" src="https://github.com/user-attachments/assets/0248a408-6bb6-4790-bd0f-f90537e2f21a" />
|
|
34
|
+
<img width="256" alt="large_xml_document" src="https://github.com/user-attachments/assets/539a2a69-f475-46a5-bffc-1e8805a5a5e7" />
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
### xmlpydict supports the following
|
|
38
|
+
|
|
39
|
+
[CDataSection](https://www.w3.org/TR/xml/#sec-cdata-sect): CDATA Sections are stored as {'#text': CData}.
|
|
40
|
+
|
|
41
|
+
[Comments](https://www.w3.org/TR/xml/#sec-comments): Comments are tokenized for correctness, but have no effect on what is returned.
|
|
42
|
+
|
|
43
|
+
[Element Tags](https://www.w3.org/TR/xml/#sec-starttags): Allows for duplicate attributes, however only the latest defined will be taken.
|
|
44
|
+
|
|
45
|
+
[Characters](https://www.w3.org/TR/xml/#charsets): Similar to CDATA, text is stored as {'#text': Char}; however, this text is stripped.
|
|
46
|
+
|
|
47
|
+
```py
|
|
48
|
+
# Empty tags are containers
|
|
49
|
+
>>> from xmlpydict import parse
|
|
50
|
+
>>> parse("<a></a>")
|
|
51
|
+
{'a': None}
|
|
52
|
+
>>> parse("<a/>")
|
|
53
|
+
{'a': None}
|
|
54
|
+
>>> parse("<a/>").get('href')
|
|
55
|
+
None
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Attribute prefixing
|
|
59
|
+
|
|
60
|
+
```py
|
|
61
|
+
# Change prefix from default "@" with keyword argument attr_prefix
|
|
62
|
+
>>> from xmlpydict import parse
|
|
63
|
+
>>> parse('<p width="10" height="5"></p>', attr_prefix="$")
|
|
64
|
+
{"p": {"$width": "10", "$height": "5"}}
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
### Exceptions
|
|
69
|
+
|
|
70
|
+
```py
|
|
71
|
+
# Grammar and structure of the xml_content is checked while parsing
|
|
72
|
+
>>> from xmlpydict import parse
|
|
73
|
+
>>> parse("<a></ a>")
|
|
74
|
+
xml.parsers.expat.ExpatError: not well-formed (invalid token): line 1, column 5
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
### Unsupported
|
|
79
|
+
|
|
80
|
+
Prolog / Enforcing Document Type Definition and Element Type Declarations
|
|
81
|
+
|
|
82
|
+
Entity Referencing
|
|
83
|
+
|
|
84
|
+
Namespaces
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "xmlpydict"
|
|
7
|
-
version = "0.0.
|
|
7
|
+
version = "0.0.11"
|
|
8
8
|
description="xml to dictionary tool for python"
|
|
9
9
|
authors = [
|
|
10
10
|
{name = "Matthew Taylor", email = "matthew.taylor.andre@gmail.com"},
|
|
@@ -18,13 +18,13 @@ classifiers = [
|
|
|
18
18
|
"License :: OSI Approved :: MIT License",
|
|
19
19
|
"Programming Language :: Python :: 3",
|
|
20
20
|
"Programming Language :: Python :: 3 :: Only",
|
|
21
|
-
"Programming Language :: Python :: 3.7",
|
|
22
21
|
"Programming Language :: Python :: 3.8",
|
|
23
22
|
"Programming Language :: Python :: 3.9",
|
|
24
23
|
"Programming Language :: Python :: 3.10",
|
|
25
24
|
"Programming Language :: Python :: 3.11",
|
|
26
25
|
"Programming Language :: Python :: Implementation :: CPython",
|
|
27
26
|
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
27
|
+
"Topic :: Text Processing :: Markup :: XML",
|
|
28
28
|
]
|
|
29
29
|
|
|
30
30
|
[project.readme]
|
|
@@ -32,4 +32,4 @@ file = "README.md"
|
|
|
32
32
|
content-type = "text/markdown"
|
|
33
33
|
|
|
34
34
|
[project.optional-dependencies]
|
|
35
|
-
tests = [ "pytest", "
|
|
35
|
+
tests = [ "pytest", "requests" ]
|
|
@@ -16,8 +16,8 @@ class build_ext(build_ext_orig):
|
|
|
16
16
|
setup(
|
|
17
17
|
include_package_data=True,
|
|
18
18
|
ext_modules=[
|
|
19
|
-
Extension("
|
|
19
|
+
Extension("pyxmlhandler", ["src/xmlparse.cpp"]),
|
|
20
20
|
],
|
|
21
21
|
cmdclass={"build_ext": build_ext},
|
|
22
|
-
|
|
22
|
+
packages=["xmlpydict"],
|
|
23
23
|
)
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) 2023 Matthew Andre Taylor
|
|
3
|
+
*/
|
|
4
|
+
#include <Python.h>
#include <stdio.h>
#include <cctype>
#include <new>
#include <vector>
|
|
8
|
+
|
|
9
|
+
static PyObject* strip(PyObject* s_obj) {
|
|
10
|
+
Py_ssize_t start = 0;
|
|
11
|
+
Py_ssize_t end = PyUnicode_GetLength(s_obj);
|
|
12
|
+
while (start < end && std::isspace(PyUnicode_ReadChar(s_obj, start))) {
|
|
13
|
+
++start;
|
|
14
|
+
}
|
|
15
|
+
while (end > start && std::isspace(PyUnicode_ReadChar(s_obj, end - 1))) {
|
|
16
|
+
--end;
|
|
17
|
+
}
|
|
18
|
+
return PyUnicode_Substring(s_obj, start, end);
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
typedef struct {
|
|
22
|
+
PyObject_HEAD PyObject* item; // current dict
|
|
23
|
+
PyObject* data; // character data buffer
|
|
24
|
+
std::vector<PyObject*> item_stack;
|
|
25
|
+
std::vector<PyObject*> data_stack;
|
|
26
|
+
PyObject* attr_prefix;
|
|
27
|
+
PyObject* cdata_key;
|
|
28
|
+
} PyDictHandler;
|
|
29
|
+
|
|
30
|
+
/**
 * tp_new slot: allocate a handler instance and put it into a defined state.
 *
 * tp_alloc() only hands back zero-filled storage; it does not run C++
 * constructors. The two std::vector members are therefore constructed here
 * with placement new so that every later use operates on real objects.
 *
 * @return a new reference to the instance, or NULL with an exception set
 *         when the allocation fails.
 */
static PyObject* PyDictHandler_new(PyTypeObject* type, PyObject* args,
                                   PyObject* kwargs) {
  PyDictHandler* self = (PyDictHandler*)type->tp_alloc(type, 0);
  if (self == NULL) {
    return NULL;
  }

  new (&self->item_stack) std::vector<PyObject*>();
  new (&self->data_stack) std::vector<PyObject*>();

  // Py_None is kept as a borrowed placeholder, matching how the rest of the
  // handler assigns it; this also keeps `item` readable before __init__ runs.
  self->item = Py_None;
  return (PyObject*)self;
}
|
|
36
|
+
|
|
37
|
+
/**
 * tp_init slot: configure the handler.
 *
 * Keyword arguments (both optional, both str):
 *   attr_prefix  prefix added to attribute names, default "@"
 *   cdata_key    key used for element text, default "#text"
 *
 * @return 0 on success, -1 with an exception set on bad arguments or when an
 *         allocation fails. On failure the previous state is left untouched.
 */
static int PyDictHandler_init(PyDictHandler* self, PyObject* args,
                              PyObject* kwargs) {
  const char* attr_prefix = "@";
  const char* cdata_key = "#text";
  // Kept const: binding string literals to plain char* is not valid C++.
  // The parser never writes through the keyword list.
  static const char* const keywords[] = {"attr_prefix", "cdata_key", NULL};

  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss",
                                   const_cast<char**>(keywords),
                                   &attr_prefix, &cdata_key))
    return -1;

  // Build every new object first so a failure cannot leave the handler
  // half-initialised.
  PyObject* new_data = PyUnicode_New(0, 127);  // empty string
  PyObject* new_prefix = PyUnicode_FromString(attr_prefix);
  PyObject* new_key = PyUnicode_FromString(cdata_key);
  if (new_data == NULL || new_prefix == NULL || new_key == NULL) {
    Py_XDECREF(new_data);
    Py_XDECREF(new_prefix);
    Py_XDECREF(new_key);
    return -1;
  }

  PyObject* old_item = self->item;
  PyObject* old_data = self->data;
  PyObject* old_prefix = self->attr_prefix;
  PyObject* old_key = self->cdata_key;

  self->item = Py_None;  // borrowed placeholder, never decref'd
  self->data = new_data;
  self->attr_prefix = new_prefix;
  self->cdata_key = new_key;

  // __init__ may be called again on a live object: release what it held.
  // Py_None is only ever stored borrowed, so it is skipped.
  if (old_item != NULL && old_item != Py_None) {
    Py_DECREF(old_item);
  }
  Py_XDECREF(old_data);
  Py_XDECREF(old_prefix);
  Py_XDECREF(old_key);
  return 0;
}
|
|
53
|
+
|
|
54
|
+
/**
 * CharacterDataHandler callback: append a chunk of text to the buffer of the
 * element that is currently open.
 *
 * @param data_obj  the text chunk (expected to be a str).
 * @return None, or NULL with an exception set when the append fails.
 */
static PyObject* characters(PyDictHandler* self, PyObject* data_obj) {
  PyUnicode_Append(&self->data, data_obj);
  if (self->data == NULL) {
    // PyUnicode_Append() reports failure by clearing the left operand and
    // setting an exception; propagate it instead of returning None on top
    // of a pending error.
    return NULL;
  }
  Py_RETURN_NONE;
}
|
|
58
|
+
|
|
59
|
+
/**
 * StartElementHandler callback: open a new element.
 *
 * The state of the enclosing element (`item`, `data`) is saved on the stacks
 * and replaced by a fresh state for the new element: a dict holding the
 * prefixed attributes (or Py_None when there are none) and an empty text
 * buffer.
 *
 * @param args  tuple (name: str, attrs: dict).
 * @return None, or NULL with an exception set. Nothing is pushed and the
 *         handler state is unchanged when an error is reported.
 */
static PyObject* startElement(PyDictHandler* self, PyObject* args) {
  const char* name;  // only parsed to validate the argument
  PyObject* attrs;
  if (!PyArg_ParseTuple(args, "sO", &name, &attrs)) {
    return NULL;
  }

  // Py_None is stored as a borrowed placeholder for "no entries yet".
  PyObject* element_item = Py_None;

  if (PyDict_Check(attrs) && PyDict_Size(attrs) > 0) {
    element_item = PyDict_New();
    if (element_item == NULL) {
      return NULL;
    }

    PyObject *key, *value;
    Py_ssize_t pos = 0;
    while (PyDict_Next(attrs, &pos, &key, &value)) {
      PyObject* prefixed_key = PyUnicode_Concat(self->attr_prefix, key);
      if (prefixed_key == NULL) {
        Py_DECREF(element_item);
        return NULL;
      }
      int status = PyDict_SetItem(element_item, prefixed_key, value);
      // PyDict_SetItem() takes its own reference; drop ours either way.
      Py_DECREF(prefixed_key);
      if (status < 0) {
        Py_DECREF(element_item);
        return NULL;
      }
    }
  }

  PyObject* element_data = PyUnicode_New(0, 127);  // fresh data buffer
  if (element_data == NULL) {
    if (element_item != Py_None) {
      Py_DECREF(element_item);
    }
    return NULL;
  }

  // Everything that can fail has succeeded: commit the new state.
  self->item_stack.push_back(self->item);
  self->data_stack.push_back(self->data);
  self->item = element_item;
  self->data = element_data;
  Py_RETURN_NONE;
}
|
|
87
|
+
|
|
88
|
+
/**
 * Store `value` under `key` in the dict `target`.
 *
 * - When `target` is Py_None a new dict is created and written back through
 *   the reference parameter.
 * - The first value for a key is stored directly.
 * - A second value for the same key turns the entry into a two-element list;
 *   further values are appended to that list.
 *
 * `key` and `value` are borrowed: this function never consumes the caller's
 * references.
 *
 * @return `target` (borrowed) on success, NULL with an exception set on
 *         failure.
 */
static PyObject* updateChildren(PyObject*& target, PyObject* key, PyObject* value) {
  if (target == Py_None) {
    // Py_None is only stored as a borrowed placeholder, so there is nothing
    // to release when it is replaced.
    PyObject* created = PyDict_New();
    if (created == NULL) {
      return NULL;
    }
    target = created;
  }

  // Borrowed reference; NULL without an exception means "key not present".
  PyObject* existing = PyDict_GetItemWithError(target, key);
  if (existing == NULL) {
    if (PyErr_Occurred()) {
      return NULL;
    }
    if (PyDict_SetItem(target, key, value) < 0) {
      return NULL;
    }
    return target;
  }

  if (PyList_Check(existing)) {
    if (PyList_Append(existing, value) < 0) {
      return NULL;
    }
    return target;
  }

  PyObject* siblings = PyList_New(2);
  if (siblings == NULL) {
    return NULL;
  }
  // PyList_SetItem() steals a reference, but both objects are only borrowed
  // here, so each needs its own reference first.
  Py_INCREF(existing);
  PyList_SetItem(siblings, 0, existing);
  Py_INCREF(value);
  PyList_SetItem(siblings, 1, value);

  int status = PyDict_SetItem(target, key, siblings);
  Py_DECREF(siblings);  // the dict holds its own reference now
  if (status < 0) {
    return NULL;
  }
  return target;
}

/**
 * EndElementHandler callback: close the element that is currently open and
 * attach its value to the enclosing element under `name_obj`.
 *
 * The value is:
 *   - the element's dict (attributes and children), with the stripped text
 *     stored under `cdata_key` when there is any, or
 *   - the stripped text alone when the element has no dict, or
 *   - None when it has neither.
 *
 * A call with no element open is ignored.
 *
 * @return None, or NULL with an exception set.
 */
static PyObject* endElement(PyDictHandler* self, PyObject* name_obj) {
  if (self->data_stack.empty()) {
    Py_RETURN_NONE;
  }

  PyObject* child_data = self->data;  // owned buffer of the closing element
  PyObject* child_item = self->item;  // owned dict, or borrowed Py_None

  PyObject* text = strip(child_data);  // new reference
  if (text == NULL) {
    return NULL;
  }
  const bool has_data = PyUnicode_GetLength(text) > 0;

  // Step back to the enclosing element.
  self->item = self->item_stack.back();
  self->data = self->data_stack.back();
  self->item_stack.pop_back();
  self->data_stack.pop_back();

  // The raw buffer is no longer reachable from the handler; `text` holds its
  // own reference even when stripping changed nothing.
  Py_DECREF(child_data);

  PyObject* updated = NULL;
  if (child_item != Py_None) {
    if (!has_data ||
        PyDict_SetItem(child_item, self->cdata_key, text) == 0) {
      updated = updateChildren(self->item, name_obj, child_item);
    }
    // The parent's container owns the dict now (or an error occurred);
    // release the handler's reference either way.
    Py_DECREF(child_item);
  } else {
    updated = updateChildren(self->item, name_obj, has_data ? text : Py_None);
  }
  Py_DECREF(text);

  if (updated == NULL) {
    return NULL;
  }
  Py_RETURN_NONE;
}
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
/**
 * Methods exposed on the handler type; their names match the expat callback
 * slots they are assigned to.
 */
static PyMethodDef PyDictHandler_methods[] = {
    // characters(data): one positional argument.
    {"characters", (PyCFunction)characters, METH_O,
     "Handle character data"},
    // startElement(name, attrs): argument tuple.
    {"startElement", (PyCFunction)startElement, METH_VARARGS,
     "Handle start of an element"},
    // endElement(name): one positional argument.
    {"endElement", (PyCFunction)endElement, METH_O,
     "Handle end of an element"},
    // End-of-table marker.
    {NULL, NULL, 0, NULL},
};
|
|
143
|
+
|
|
144
|
+
/**
 * Getter for the read-only `item` attribute: the dictionary built so far.
 *
 * @return a new reference to the current item; None when nothing has been
 *         built yet or when the handler was never initialised.
 */
static PyObject* PyDictHandler_get_item(PyDictHandler *self, void *closure)
{
  if (self->item == NULL) {
    // __init__ never ran (e.g. a subclass skipped it): report "no result"
    // instead of dereferencing a null pointer.
    Py_RETURN_NONE;
  }
  Py_INCREF(self->item);
  return self->item;
}
|
|
149
|
+
|
|
150
|
+
/**
 * Computed attributes of the handler type. `item` is read-only: it has a
 * getter and no setter.
 */
static PyGetSetDef PyDictHandler_getset[] = {
    {"item", (getter)PyDictHandler_get_item, NULL, NULL, NULL},
    {NULL, NULL, NULL, NULL, NULL},  // sentinel
};
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
static PyTypeObject PyDictHandlerType = {
|
|
163
|
+
PyVarObject_HEAD_INIT(NULL, 0) "pyxmlhandler._PyDictHandler", // tp_name
|
|
164
|
+
sizeof(PyDictHandler), // tp_basicsize
|
|
165
|
+
0, // tp_itemsize
|
|
166
|
+
0, // tp_dealloc
|
|
167
|
+
0, // tp_vectorcall_offset
|
|
168
|
+
0, // tp_getattr
|
|
169
|
+
0, // tp_setattr
|
|
170
|
+
0, // tp_as_async
|
|
171
|
+
0, // tp_repr
|
|
172
|
+
0, // tp_as_number
|
|
173
|
+
0, // tp_as_sequence
|
|
174
|
+
0, // tp_as_mapping
|
|
175
|
+
0, // tp_hash
|
|
176
|
+
0, // tp_call
|
|
177
|
+
0, // tp_str
|
|
178
|
+
0, // tp_getattro
|
|
179
|
+
0, // tp_setattro
|
|
180
|
+
0, // tp_as_buffer
|
|
181
|
+
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, // tp_flags
|
|
182
|
+
"Handler that converts XML to Python dict", // tp_doc
|
|
183
|
+
0, // tp_traverse
|
|
184
|
+
0, // tp_clear
|
|
185
|
+
0, // tp_richcompare
|
|
186
|
+
0, // tp_weaklistoffset
|
|
187
|
+
0, // tp_iter
|
|
188
|
+
0, // tp_iternext
|
|
189
|
+
PyDictHandler_methods, // tp_methods
|
|
190
|
+
0, // tp_members
|
|
191
|
+
PyDictHandler_getset, // tp_getset
|
|
192
|
+
0, // tp_base
|
|
193
|
+
0, // tp_dict
|
|
194
|
+
0, // tp_descr_get
|
|
195
|
+
0, // tp_descr_set
|
|
196
|
+
0, // tp_dictoffset
|
|
197
|
+
(initproc)PyDictHandler_init, // tp_init
|
|
198
|
+
0, // tp_alloc
|
|
199
|
+
PyDictHandler_new, // tp_new
|
|
200
|
+
};
|
|
201
|
+
|
|
202
|
+
/**
 * Definition of the `pyxmlhandler` extension module. It declares no
 * module-level functions; its only content is the handler type added by the
 * init function.
 */
static PyModuleDef pyxmlhandlermodule = {
    PyModuleDef_HEAD_INIT,
    "pyxmlhandler",                                     // m_name
    "Module that provides XML to Python dict parsing",  // m_doc
    -1,                                                 // m_size
    NULL,                                               // m_methods
    NULL,                                               // m_slots
    NULL,                                               // m_traverse
    NULL,                                               // m_clear
    NULL,                                               // m_free
};
|
|
209
|
+
|
|
210
|
+
/**
 * Module initialisation: finalise the handler type and publish it as
 * `pyxmlhandler._PyDictHandler`.
 *
 * @return the new module object, or NULL with an exception set on failure.
 */
PyMODINIT_FUNC PyInit_pyxmlhandler(void) {
  if (PyType_Ready(&PyDictHandlerType) < 0)
    return NULL;

  PyObject* m = PyModule_Create(&pyxmlhandlermodule);
  if (m == NULL)
    return NULL;

  // PyModule_AddObject() steals the reference only when it succeeds, so the
  // failure path has to give back both the type reference and the module.
  Py_INCREF(&PyDictHandlerType);
  if (PyModule_AddObject(m, "_PyDictHandler",
                         (PyObject*)&PyDictHandlerType) < 0) {
    Py_DECREF(&PyDictHandlerType);
    Py_DECREF(m);
    return NULL;
  }
  return m;
}
|
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
import pytest
|
|
2
|
-
from xmlpydict import parse
|
|
3
2
|
import json
|
|
3
|
+
from xmlpydict import parse
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def test_simple():
|
|
7
|
-
assert parse("") == {}
|
|
8
|
-
assert parse("<p
|
|
9
|
-
assert parse("<p></p>") == {"p": {}}
|
|
7
|
+
assert parse("<p/>") == {"p": None}
|
|
8
|
+
assert parse("<p></p>") == {"p": None}
|
|
10
9
|
assert parse('<p width="10"></p>') == {"p": {"@width": "10"}}
|
|
10
|
+
assert parse("<p>Hello</p>") == {"p": "Hello"}
|
|
11
|
+
|
|
11
12
|
assert parse('<p width="10">Hello World</p>') == {
|
|
12
13
|
"p": {"@width": "10", "#text": "Hello World"}
|
|
13
14
|
}
|
|
@@ -21,7 +22,18 @@ def test_simple():
|
|
|
21
22
|
"p": {"@width": "10", "@height": "20"}
|
|
22
23
|
}
|
|
23
24
|
assert parse("<p>Hey <b>bold</b>There</p>") == {
|
|
24
|
-
"p": {"#text": "
|
|
25
|
+
"p": {"#text": "Hey There", "b": "bold"}
|
|
26
|
+
}
|
|
27
|
+
assert parse("<p>Hey <b>bold</b>There <b>bold</b>Buddy </p>") == {
|
|
28
|
+
"p": {"#text": "Hey There Buddy", "b": ["bold", "bold"]}
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
assert parse("<p>Hey <b/>There Buddy</p>") == {
|
|
32
|
+
"p": {"#text": "Hey There Buddy", "b": None}
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
assert parse("<p>Hey <b/>There Buddy <b/> </p>") == {
|
|
36
|
+
"p": {"#text": "Hey There Buddy", "b": [None, None]}
|
|
25
37
|
}
|
|
26
38
|
|
|
27
39
|
assert (
|
|
@@ -66,15 +78,18 @@ def test_simple():
|
|
|
66
78
|
)
|
|
67
79
|
|
|
68
80
|
|
|
69
|
-
def
|
|
70
|
-
assert parse("<
|
|
71
|
-
|
|
72
|
-
assert parse("<book><p></p></book><card/>") == {"book": {"p": {}}, "card": {}}
|
|
73
|
-
assert parse("<pizza></pizza><book><p></p></book><card/>") == {
|
|
74
|
-
"pizza": {},
|
|
75
|
-
"book": {"p": {}},
|
|
76
|
-
"card": {},
|
|
81
|
+
def test_cdata():
|
|
82
|
+
assert parse("<content><![CDATA[<p>This is a paragraph</p>]]></content>") == {
|
|
83
|
+
"content": "<p>This is a paragraph</p>"
|
|
77
84
|
}
|
|
85
|
+
assert parse(
|
|
86
|
+
"<special_chars><![CDATA[$ ^ * % & <> () + - + ` ~]]></special_chars>"
|
|
87
|
+
) == {"special_chars": "$ ^ * % & <> () + - + ` ~"}
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def test_nested():
|
|
91
|
+
assert parse("<book><p/></book> ") == {"book": {"p": None}}
|
|
92
|
+
assert parse("<book><p></p></book>") == {"book": {"p": None}}
|
|
78
93
|
|
|
79
94
|
|
|
80
95
|
def test_list():
|
|
@@ -89,12 +104,20 @@ def test_list():
|
|
|
89
104
|
|
|
90
105
|
|
|
91
106
|
def test_comment():
|
|
92
|
-
assert parse("
|
|
107
|
+
assert parse("<p/><!-- simple comment -->") == {"p": None}
|
|
93
108
|
comment = """<world>
|
|
94
109
|
<!-- $comment+++@python -->
|
|
95
110
|
<lake>Content</lake>
|
|
96
111
|
</world>"""
|
|
97
|
-
assert parse(comment) == {"world": {"lake":
|
|
112
|
+
assert parse(comment) == {"world": {"lake": "Content"}}
|
|
113
|
+
multiple_comments = """<book>
|
|
114
|
+
<!-- Comment 0 -->
|
|
115
|
+
<!-- Comment 1 -->
|
|
116
|
+
<lines>510</lines>
|
|
117
|
+
<!-- Comment 2 -->
|
|
118
|
+
<!-- -->
|
|
119
|
+
</book>"""
|
|
120
|
+
assert parse(multiple_comments) == {"book": {"lines": "510"}}
|
|
98
121
|
|
|
99
122
|
|
|
100
123
|
def test_files():
|
|
@@ -269,16 +292,61 @@ def test_files():
|
|
|
269
292
|
|
|
270
293
|
def test_exception():
|
|
271
294
|
xml_strings = [
|
|
272
|
-
"< p/>",
|
|
273
|
-
"<p>",
|
|
274
|
-
"<p/ >",
|
|
275
295
|
"<p height'10'/>",
|
|
276
296
|
"<p height='10'width='5'/>",
|
|
277
|
-
"<p width='5/>",
|
|
278
297
|
"<p width=5'/>",
|
|
279
|
-
"</p>",
|
|
280
298
|
"<pwidth='5'/>",
|
|
299
|
+
"<!---->",
|
|
300
|
+
"<a></p>",
|
|
301
|
+
"<></>",
|
|
302
|
+
"</>",
|
|
303
|
+
"<",
|
|
304
|
+
">",
|
|
305
|
+
"<nested></p></nested>",
|
|
281
306
|
]
|
|
282
307
|
for xml_str in xml_strings:
|
|
283
308
|
with pytest.raises(Exception):
|
|
284
309
|
parse(xml_str)
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def test_prefix():
|
|
313
|
+
assert parse("<p></p>", attr_prefix="$") == {"p": None}
|
|
314
|
+
assert parse('<p width="10"></p>', attr_prefix="$") == {"p": {"$width": "10"}}
|
|
315
|
+
assert parse('<p width="10" height="5"></p>', attr_prefix="$") == {
|
|
316
|
+
"p": {"$width": "10", "$height": "5"}
|
|
317
|
+
}
|
|
318
|
+
assert parse('<p width="10" height="5"></p>', attr_prefix="$$$$$$$$$") == {
|
|
319
|
+
"p": {"$$$$$$$$$width": "10", "$$$$$$$$$height": "5"}
|
|
320
|
+
}
|
|
321
|
+
assert parse('<p width="10" height="5"></p>', attr_prefix="") == {
|
|
322
|
+
"p": {"width": "10", "height": "5"}
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def test_document():
|
|
327
|
+
s = """<?xml version="1.0" encoding="UTF-8"?><repository>
|
|
328
|
+
<project pypi="xmlpydict">
|
|
329
|
+
<title>XML document parser</title>
|
|
330
|
+
<author>Matthew Taylor</author>
|
|
331
|
+
</project>
|
|
332
|
+
<project pypi="blank">
|
|
333
|
+
<title>Test project</title>
|
|
334
|
+
<author>Matthew Taylor</author>
|
|
335
|
+
</project>
|
|
336
|
+
</repository>"""
|
|
337
|
+
assert parse(s) == {
|
|
338
|
+
"repository": {
|
|
339
|
+
"project": [
|
|
340
|
+
{
|
|
341
|
+
"@pypi": "xmlpydict",
|
|
342
|
+
"title": "XML document parser",
|
|
343
|
+
"author": "Matthew Taylor",
|
|
344
|
+
},
|
|
345
|
+
{
|
|
346
|
+
"@pypi": "blank",
|
|
347
|
+
"title": "Test project",
|
|
348
|
+
"author": "Matthew Taylor",
|
|
349
|
+
},
|
|
350
|
+
]
|
|
351
|
+
}
|
|
352
|
+
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from pyxmlhandler import _PyDictHandler
|
|
2
|
+
from xml.parsers import expat
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def parse(xml_content, attr_prefix: str = "@", cdata_key: str = "#text") -> dict:
    """
    Convert an in-memory XML document into a python dictionary.

    Args:
        xml_content: The XML document to parse.
        attr_prefix: Prefix put in front of attribute names in the result.
        cdata_key: Key under which character data is stored in the result.

    Returns:
        The dictionary built from the XML document.
    """
    dict_builder = _PyDictHandler(attr_prefix=attr_prefix, cdata_key=cdata_key)
    xml_parser = expat.ParserCreate()

    # Route the expat events the builder understands to its methods.
    callbacks = (
        ("CharacterDataHandler", dict_builder.characters),
        ("StartElementHandler", dict_builder.startElement),
        ("EndElementHandler", dict_builder.endElement),
    )
    for slot, callback in callbacks:
        setattr(xml_parser, slot, callback)

    # The second argument marks this as the final chunk of input.
    xml_parser.Parse(xml_content, True)
    return dict_builder.item
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def parse_file(file_path, attr_prefix: str = "@", cdata_key: str = "#text") -> dict:
    """
    Parse an XML file into a python dictionary.

    Args:
        file_path: The path to the XML file to be parsed.
        attr_prefix: The prefix to use for attributes in the resulting dictionary.
        cdata_key: The key to use for character data in the resulting dictionary.

    Returns:
        A dictionary representation of the XML file content.

    Raises:
        OSError: If the file cannot be opened.
        xml.parsers.expat.ExpatError: If the content is not well-formed XML.
    """
    handler = _PyDictHandler(attr_prefix=attr_prefix, cdata_key=cdata_key)
    parser = expat.ParserCreate()
    parser.CharacterDataHandler = handler.characters
    parser.StartElementHandler = handler.startElement
    parser.EndElementHandler = handler.endElement
    # ParseFile() requires read() to return bytes; a text-mode file makes it
    # raise TypeError. Reading raw bytes also lets expat apply the encoding
    # declared by the document itself.
    with open(file_path, "rb") as f:
        parser.ParseFile(f)
    return handler.item
|