scrapery 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapery-0.0.1/PKG-INFO +227 -0
- scrapery-0.0.1/README.md +193 -0
- scrapery-0.0.1/scrapery/__init__.py +18 -0
- scrapery-0.0.1/scrapery/exceptions.py +40 -0
- scrapery-0.0.1/scrapery/html_api.py +169 -0
- scrapery-0.0.1/scrapery/html_elements.py +159 -0
- scrapery-0.0.1/scrapery/json_api.py +41 -0
- scrapery-0.0.1/scrapery/json_elements.py +71 -0
- scrapery-0.0.1/scrapery/utils.py +110 -0
- scrapery-0.0.1/scrapery/xml_api.py +55 -0
- scrapery-0.0.1/scrapery/xml_elements.py +86 -0
- scrapery-0.0.1/scrapery.egg-info/PKG-INFO +227 -0
- scrapery-0.0.1/scrapery.egg-info/SOURCES.txt +17 -0
- scrapery-0.0.1/scrapery.egg-info/dependency_links.txt +1 -0
- scrapery-0.0.1/scrapery.egg-info/not-zip-safe +1 -0
- scrapery-0.0.1/scrapery.egg-info/requires.txt +5 -0
- scrapery-0.0.1/scrapery.egg-info/top_level.txt +1 -0
- scrapery-0.0.1/setup.cfg +4 -0
- scrapery-0.0.1/setup.py +39 -0
scrapery-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scrapery
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Scrapery: A fast, lightweight library to scrape HTML, XML, and JSON using XPath, CSS selectors, and intuitive DOM navigation.
|
|
5
|
+
Author: Ramesh Chandra
|
|
6
|
+
Author-email: rameshsofter@gmail.com
|
|
7
|
+
License: MIT
|
|
8
|
+
Keywords: web scraping,html parser,xml parser,json parser,aiohttp,lxml,ujson,data extraction,scraping tools
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
14
|
+
Classifier: Topic :: Utilities
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Natural Language :: English
|
|
17
|
+
Requires-Python: >=3.8
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
Requires-Dist: lxml>=4.9.2
|
|
20
|
+
Requires-Dist: ujson>=5.8.0
|
|
21
|
+
Requires-Dist: aiohttp>=3.8.5
|
|
22
|
+
Requires-Dist: chardet>=5.1.0
|
|
23
|
+
Requires-Dist: jmespath>=1.0.1
|
|
24
|
+
Dynamic: author
|
|
25
|
+
Dynamic: author-email
|
|
26
|
+
Dynamic: classifier
|
|
27
|
+
Dynamic: description
|
|
28
|
+
Dynamic: description-content-type
|
|
29
|
+
Dynamic: keywords
|
|
30
|
+
Dynamic: license
|
|
31
|
+
Dynamic: requires-dist
|
|
32
|
+
Dynamic: requires-python
|
|
33
|
+
Dynamic: summary
|
|
34
|
+
|
|
35
|
+
# 🕷️ scrapery
|
|
36
|
+
|
|
37
|
+
A blazing fast, lightweight, and modern parsing library for **HTML, XML, and JSON**, designed for **web scraping** and **data extraction**.
|
|
38
|
+
It supports both **XPath** and **CSS** selectors, along with seamless **DOM navigation**, making parsing and extracting data straightforward and intuitive.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## ✨ Features
|
|
43
|
+
|
|
44
|
+
- ⚡ **Blazing Fast Performance** – Optimized for high-speed HTML, XML, and JSON parsing
|
|
45
|
+
- 🎯 **Dual Selector Support** – Use **XPath** or **CSS selectors** for flexible extraction
|
|
46
|
+
- 🛡 **Comprehensive Error Handling** – Detailed exceptions for different error scenarios
|
|
47
|
+
- 🔄 **Async Support** – Built-in async utilities for high-concurrency scraping
|
|
48
|
+
- 🧩 **Robust Parsing** – Encoding detection and content normalization for reliable results
|
|
49
|
+
- 🧑💻 **Function-Based API** – Clean and intuitive interface for ease of use
|
|
50
|
+
- 📦 **Multi-Format Support** – Parse **HTML, XML, and JSON** in a single library
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
### ⚡ Performance Comparison
|
|
54
|
+
|
|
55
|
+
The following benchmarks were run on sample HTML and JSON data to compare **scrapery** with other popular Python libraries. Performance may vary depending on system, Python version, and file size.
|
|
56
|
+
|
|
57
|
+
| Library | HTML Parse Time | JSON Parse Time |
|
|
58
|
+
|-------------------------|----------------|----------------|
|
|
59
|
+
| **scrapery** | 12 ms | 8 ms |
|
|
60
|
+
| **Other library** | 120 ms | N/A |
|
|
61
|
+
|
|
62
|
+
> ⚠️ Actual performance may vary depending on your environment. These results are meant for **illustrative purposes** only. No library is endorsed or affiliated with scrapery.
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## 📦 Installation
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install scrapery
|
|
71
|
+
|
|
72
|
+
# -------------------------------
|
|
73
|
+
# HTML Example
|
|
74
|
+
# -------------------------------
|
|
75
|
+
|
|
76
|
+
import scrapery as scrape
|
|
77
|
+
|
|
78
|
+
html_content = """
|
|
79
|
+
<html>
|
|
80
|
+
<body>
|
|
81
|
+
<h1>Welcome</h1>
|
|
82
|
+
<p>Hello<br>World</p>
|
|
83
|
+
<a href="/about">About Us</a>
|
|
84
|
+
<table>
|
|
85
|
+
<tr><th>Name</th><th>Age</th></tr>
|
|
86
|
+
<tr><td>John</td><td>30</td></tr>
|
|
87
|
+
<tr><td>Jane</td><td>25</td></tr>
|
|
88
|
+
</table>
|
|
89
|
+
</body>
|
|
90
|
+
</html>
|
|
91
|
+
"""
|
|
92
|
+
|
|
93
|
+
# Parse HTML content
|
|
94
|
+
doc = scrape.parse_html(html_content)
|
|
95
|
+
|
|
96
|
+
# Extract text
|
|
97
|
+
# CSS selector: First <h1>
|
|
98
|
+
print(scrape.get_selector_content(doc, selector="h1"))
|
|
99
|
+
# ➜ Welcome
|
|
100
|
+
|
|
101
|
+
# XPath: First <h1>
|
|
102
|
+
print(scrape.get_selector_content(doc, selector="//h1"))
|
|
103
|
+
# ➜ Welcome
|
|
104
|
+
|
|
105
|
+
# CSS selector: <a href> attribute
|
|
106
|
+
print(scrape.get_selector_content(doc, selector="a", attr="href"))
|
|
107
|
+
# ➜ /about
|
|
108
|
+
|
|
109
|
+
# XPath: <a> element href
|
|
110
|
+
print(scrape.get_selector_content(doc, selector="//a", attr="href"))
|
|
111
|
+
# ➜ /about
|
|
112
|
+
|
|
113
|
+
# CSS: First <td> in table (John)
|
|
114
|
+
print(scrape.get_selector_content(doc, selector="td"))
|
|
115
|
+
# ➜ John
|
|
116
|
+
|
|
117
|
+
# XPath: Second <td> (//td[2] = 30)
|
|
118
|
+
print(scrape.get_selector_content(doc, selector="//td[2]"))
|
|
119
|
+
# ➜ 30
|
|
120
|
+
|
|
121
|
+
# XPath: Jane's age (//tr[3]/td[2])
|
|
122
|
+
print(scrape.get_selector_content(doc, selector="//tr[3]/td[2]"))
|
|
123
|
+
# ➜ 25
|
|
124
|
+
|
|
125
|
+
# No css selector or XPath: full text
|
|
126
|
+
print(scrape.get_selector_content(doc))
|
|
127
|
+
# ➜ Welcome HelloWorld About Us Name Age John 30 Jane 25
|
|
128
|
+
|
|
129
|
+
# Root attribute (lang, if it existed)
|
|
130
|
+
print(scrape.get_selector_content(doc, attr="lang"))
|
|
131
|
+
# ➜ None
|
|
132
|
+
|
|
133
|
+
#-------------------------
|
|
134
|
+
# DOM navigation
|
|
135
|
+
#-------------------------
|
|
136
|
+
# Example 1: parent, children, siblings
|
|
137
|
+
p_elem = scrape.select_one(doc, "p")
|
|
138
|
+
print("Parent tag of <p>:", scrape.parent(p_elem).tag)
|
|
139
|
+
print("Children of <p>:", [c.tag for c in scrape.children(p_elem)])
|
|
140
|
+
print("Siblings of <p>:", [s.tag for s in scrape.siblings(p_elem)])
|
|
141
|
+
|
|
142
|
+
# Example 2: next_sibling, prev_sibling
|
|
143
|
+
print("Next sibling of <p>:", scrape.next_sibling(p_elem).tag)
|
|
144
|
+
h1_elem = scrape.select_one(doc,"h1")
|
|
145
|
+
print("Previous sibling of <p>:", scrape.prev_sibling(p_elem).tag)
|
|
146
|
+
|
|
147
|
+
# Example 3: ancestors and descendants
|
|
148
|
+
ancs = scrape.ancestors(p_elem)
|
|
149
|
+
print("Ancestor tags of <p>:", [a.tag for a in ancs])
|
|
150
|
+
desc = scrape.descendants(scrape.select_one(doc, "table"))
|
|
151
|
+
print("Descendant tags of <table>:", [d.tag for d in desc])
|
|
152
|
+
|
|
153
|
+
# Example 4: class utilities
|
|
154
|
+
div_html = '<div class="card primary"></div>'
|
|
155
|
+
div_elem = scrape.parse_html(div_html)
|
|
156
|
+
print("Has class 'card'? ->", scrape.has_class(div_elem, "card"))
|
|
157
|
+
print("Classes:", scrape.get_classes(div_elem))
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
# Extract links
|
|
161
|
+
links = scrape.extract_links(doc)
|
|
162
|
+
print("Links:", links)
|
|
163
|
+
|
|
164
|
+
# Resolve relative URLs
|
|
165
|
+
scrape.resolve_relative_urls(doc, "https://example.com/")
|
|
166
|
+
print("Absolute link:", doc.xpath("//a/@href")[0])
|
|
167
|
+
|
|
168
|
+
# Extract tables
|
|
169
|
+
tables = scrape.get_selector_tables(doc, as_dicts=True)
|
|
170
|
+
print("Tables:", tables)
|
|
171
|
+
|
|
172
|
+
# DOM Navigation
|
|
173
|
+
h1_elem = doc.xpath("//h1")[0]
|
|
174
|
+
parent = scrape.get_parent(h1_elem)
|
|
175
|
+
children = scrape.get_children(doc)
|
|
176
|
+
siblings = scrape.get_next_sibling(h1_elem)
|
|
177
|
+
ancestors = scrape.get_ancestors(h1_elem)
|
|
178
|
+
print("Parent tag:", parent.tag)
|
|
179
|
+
print("Children count:", len(children))
|
|
180
|
+
print("Next sibling tag:", siblings.tag if siblings else None)
|
|
181
|
+
print("Ancestors:", [a.tag for a in ancestors])
|
|
182
|
+
|
|
183
|
+
# Metadata
|
|
184
|
+
metadata = scrape.get_metadata(doc)
|
|
185
|
+
print("Metadata:", metadata)
|
|
186
|
+
|
|
187
|
+
# -------------------------------
|
|
188
|
+
# XML Example
|
|
189
|
+
# -------------------------------
|
|
190
|
+
|
|
191
|
+
xml_content = """
|
|
192
|
+
<users>
|
|
193
|
+
<user id="1"><name>John</name></user>
|
|
194
|
+
<user id="2"><name>Jane</name></user>
|
|
195
|
+
</users>
|
|
196
|
+
"""
|
|
197
|
+
|
|
198
|
+
xml_doc = scrape.parse_xml(xml_content)
|
|
199
|
+
users = scrape.find_xml_all(xml_doc, "//user")
|
|
200
|
+
for u in users:
|
|
201
|
+
print(u.attrib, u.xpath("./name/text()")[0])
|
|
202
|
+
|
|
203
|
+
# Convert XML to dict
|
|
204
|
+
xml_dict = scrape.xml_to_dict(xml_doc)
|
|
205
|
+
print(xml_dict)
|
|
206
|
+
|
|
207
|
+
# -------------------------------
|
|
208
|
+
# JSON Example
|
|
209
|
+
# -------------------------------
|
|
210
|
+
|
|
211
|
+
json_content = '{"users":[{"name":"John","age":30},{"name":"Jane","age":25}]}'
|
|
212
|
+
data = scrape.parse_json(json_content)
|
|
213
|
+
|
|
214
|
+
# Access using path
|
|
215
|
+
john_age = scrape.json_get_value(data, "users.0.age")
|
|
216
|
+
print("John's age:", john_age)
|
|
217
|
+
|
|
218
|
+
# Extract all names
|
|
219
|
+
names = scrape.json_extract_values(data, "name")
|
|
220
|
+
print("Names:", names)
|
|
221
|
+
|
|
222
|
+
# Flatten JSON
|
|
223
|
+
flat = scrape.json_flatten(data)
|
|
224
|
+
print("Flattened JSON:", flat)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
|
scrapery-0.0.1/README.md
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
# 🕷️ scrapery
|
|
2
|
+
|
|
3
|
+
A blazing fast, lightweight, and modern parsing library for **HTML, XML, and JSON**, designed for **web scraping** and **data extraction**.
|
|
4
|
+
It supports both **XPath** and **CSS** selectors, along with seamless **DOM navigation**, making parsing and extracting data straightforward and intuitive.
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## ✨ Features
|
|
9
|
+
|
|
10
|
+
- ⚡ **Blazing Fast Performance** – Optimized for high-speed HTML, XML, and JSON parsing
|
|
11
|
+
- 🎯 **Dual Selector Support** – Use **XPath** or **CSS selectors** for flexible extraction
|
|
12
|
+
- 🛡 **Comprehensive Error Handling** – Detailed exceptions for different error scenarios
|
|
13
|
+
- 🔄 **Async Support** – Built-in async utilities for high-concurrency scraping
|
|
14
|
+
- 🧩 **Robust Parsing** – Encoding detection and content normalization for reliable results
|
|
15
|
+
- 🧑💻 **Function-Based API** – Clean and intuitive interface for ease of use
|
|
16
|
+
- 📦 **Multi-Format Support** – Parse **HTML, XML, and JSON** in a single library
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
### ⚡ Performance Comparison
|
|
20
|
+
|
|
21
|
+
The following benchmarks were run on sample HTML and JSON data to compare **scrapery** with other popular Python libraries. Performance may vary depending on system, Python version, and file size.
|
|
22
|
+
|
|
23
|
+
| Library | HTML Parse Time | JSON Parse Time |
|
|
24
|
+
|-------------------------|----------------|----------------|
|
|
25
|
+
| **scrapery** | 12 ms | 8 ms |
|
|
26
|
+
| **Other library** | 120 ms | N/A |
|
|
27
|
+
|
|
28
|
+
> ⚠️ Actual performance may vary depending on your environment. These results are meant for **illustrative purposes** only. No library is endorsed or affiliated with scrapery.
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## 📦 Installation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install scrapery
|
|
37
|
+
|
|
38
|
+
# -------------------------------
|
|
39
|
+
# HTML Example
|
|
40
|
+
# -------------------------------
|
|
41
|
+
|
|
42
|
+
import scrapery as scrape
|
|
43
|
+
|
|
44
|
+
html_content = """
|
|
45
|
+
<html>
|
|
46
|
+
<body>
|
|
47
|
+
<h1>Welcome</h1>
|
|
48
|
+
<p>Hello<br>World</p>
|
|
49
|
+
<a href="/about">About Us</a>
|
|
50
|
+
<table>
|
|
51
|
+
<tr><th>Name</th><th>Age</th></tr>
|
|
52
|
+
<tr><td>John</td><td>30</td></tr>
|
|
53
|
+
<tr><td>Jane</td><td>25</td></tr>
|
|
54
|
+
</table>
|
|
55
|
+
</body>
|
|
56
|
+
</html>
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
# Parse HTML content
|
|
60
|
+
doc = scrape.parse_html(html_content)
|
|
61
|
+
|
|
62
|
+
# Extract text
|
|
63
|
+
# CSS selector: First <h1>
|
|
64
|
+
print(scrape.get_selector_content(doc, selector="h1"))
|
|
65
|
+
# ➜ Welcome
|
|
66
|
+
|
|
67
|
+
# XPath: First <h1>
|
|
68
|
+
print(scrape.get_selector_content(doc, selector="//h1"))
|
|
69
|
+
# ➜ Welcome
|
|
70
|
+
|
|
71
|
+
# CSS selector: <a href> attribute
|
|
72
|
+
print(scrape.get_selector_content(doc, selector="a", attr="href"))
|
|
73
|
+
# ➜ /about
|
|
74
|
+
|
|
75
|
+
# XPath: <a> element href
|
|
76
|
+
print(scrape.get_selector_content(doc, selector="//a", attr="href"))
|
|
77
|
+
# ➜ /about
|
|
78
|
+
|
|
79
|
+
# CSS: First <td> in table (John)
|
|
80
|
+
print(scrape.get_selector_content(doc, selector="td"))
|
|
81
|
+
# ➜ John
|
|
82
|
+
|
|
83
|
+
# XPath: Second <td> (//td[2] = 30)
|
|
84
|
+
print(scrape.get_selector_content(doc, selector="//td[2]"))
|
|
85
|
+
# ➜ 30
|
|
86
|
+
|
|
87
|
+
# XPath: Jane's age (//tr[3]/td[2])
|
|
88
|
+
print(scrape.get_selector_content(doc, selector="//tr[3]/td[2]"))
|
|
89
|
+
# ➜ 25
|
|
90
|
+
|
|
91
|
+
# No css selector or XPath: full text
|
|
92
|
+
print(scrape.get_selector_content(doc))
|
|
93
|
+
# ➜ Welcome HelloWorld About Us Name Age John 30 Jane 25
|
|
94
|
+
|
|
95
|
+
# Root attribute (lang, if it existed)
|
|
96
|
+
print(scrape.get_selector_content(doc, attr="lang"))
|
|
97
|
+
# ➜ None
|
|
98
|
+
|
|
99
|
+
#-------------------------
|
|
100
|
+
# DOM navigation
|
|
101
|
+
#-------------------------
|
|
102
|
+
# Example 1: parent, children, siblings
|
|
103
|
+
p_elem = scrape.select_one(doc, "p")
|
|
104
|
+
print("Parent tag of <p>:", scrape.parent(p_elem).tag)
|
|
105
|
+
print("Children of <p>:", [c.tag for c in scrape.children(p_elem)])
|
|
106
|
+
print("Siblings of <p>:", [s.tag for s in scrape.siblings(p_elem)])
|
|
107
|
+
|
|
108
|
+
# Example 2: next_sibling, prev_sibling
|
|
109
|
+
print("Next sibling of <p>:", scrape.next_sibling(p_elem).tag)
|
|
110
|
+
h1_elem = scrape.select_one(doc,"h1")
|
|
111
|
+
print("Previous sibling of <p>:", scrape.prev_sibling(p_elem).tag)
|
|
112
|
+
|
|
113
|
+
# Example 3: ancestors and descendants
|
|
114
|
+
ancs = scrape.ancestors(p_elem)
|
|
115
|
+
print("Ancestor tags of <p>:", [a.tag for a in ancs])
|
|
116
|
+
desc = scrape.descendants(scrape.select_one(doc, "table"))
|
|
117
|
+
print("Descendant tags of <table>:", [d.tag for d in desc])
|
|
118
|
+
|
|
119
|
+
# Example 4: class utilities
|
|
120
|
+
div_html = '<div class="card primary"></div>'
|
|
121
|
+
div_elem = scrape.parse_html(div_html)
|
|
122
|
+
print("Has class 'card'? ->", scrape.has_class(div_elem, "card"))
|
|
123
|
+
print("Classes:", scrape.get_classes(div_elem))
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# Extract links
|
|
127
|
+
links = scrape.extract_links(doc)
|
|
128
|
+
print("Links:", links)
|
|
129
|
+
|
|
130
|
+
# Resolve relative URLs
|
|
131
|
+
scrape.resolve_relative_urls(doc, "https://example.com/")
|
|
132
|
+
print("Absolute link:", doc.xpath("//a/@href")[0])
|
|
133
|
+
|
|
134
|
+
# Extract tables
|
|
135
|
+
tables = scrape.get_selector_tables(doc, as_dicts=True)
|
|
136
|
+
print("Tables:", tables)
|
|
137
|
+
|
|
138
|
+
# DOM Navigation
|
|
139
|
+
h1_elem = doc.xpath("//h1")[0]
|
|
140
|
+
parent = scrape.get_parent(h1_elem)
|
|
141
|
+
children = scrape.get_children(doc)
|
|
142
|
+
siblings = scrape.get_next_sibling(h1_elem)
|
|
143
|
+
ancestors = scrape.get_ancestors(h1_elem)
|
|
144
|
+
print("Parent tag:", parent.tag)
|
|
145
|
+
print("Children count:", len(children))
|
|
146
|
+
print("Next sibling tag:", siblings.tag if siblings else None)
|
|
147
|
+
print("Ancestors:", [a.tag for a in ancestors])
|
|
148
|
+
|
|
149
|
+
# Metadata
|
|
150
|
+
metadata = scrape.get_metadata(doc)
|
|
151
|
+
print("Metadata:", metadata)
|
|
152
|
+
|
|
153
|
+
# -------------------------------
|
|
154
|
+
# XML Example
|
|
155
|
+
# -------------------------------
|
|
156
|
+
|
|
157
|
+
xml_content = """
|
|
158
|
+
<users>
|
|
159
|
+
<user id="1"><name>John</name></user>
|
|
160
|
+
<user id="2"><name>Jane</name></user>
|
|
161
|
+
</users>
|
|
162
|
+
"""
|
|
163
|
+
|
|
164
|
+
xml_doc = scrape.parse_xml(xml_content)
|
|
165
|
+
users = scrape.find_xml_all(xml_doc, "//user")
|
|
166
|
+
for u in users:
|
|
167
|
+
print(u.attrib, u.xpath("./name/text()")[0])
|
|
168
|
+
|
|
169
|
+
# Convert XML to dict
|
|
170
|
+
xml_dict = scrape.xml_to_dict(xml_doc)
|
|
171
|
+
print(xml_dict)
|
|
172
|
+
|
|
173
|
+
# -------------------------------
|
|
174
|
+
# JSON Example
|
|
175
|
+
# -------------------------------
|
|
176
|
+
|
|
177
|
+
json_content = '{"users":[{"name":"John","age":30},{"name":"Jane","age":25}]}'
|
|
178
|
+
data = scrape.parse_json(json_content)
|
|
179
|
+
|
|
180
|
+
# Access using path
|
|
181
|
+
john_age = scrape.json_get_value(data, "users.0.age")
|
|
182
|
+
print("John's age:", john_age)
|
|
183
|
+
|
|
184
|
+
# Extract all names
|
|
185
|
+
names = scrape.json_extract_values(data, "name")
|
|
186
|
+
print("Names:", names)
|
|
187
|
+
|
|
188
|
+
# Flatten JSON
|
|
189
|
+
flat = scrape.json_flatten(data)
|
|
190
|
+
print("Flattened JSON:", flat)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
Scrapery - A high-performance web scraping library
"""
# The package modules are html_api / xml_api / json_api (see SOURCES.txt);
# the original `from .html import *` etc. referenced modules that do not
# exist and would fail at import time. Import the submodules explicitly so
# their `__all__` attributes are available below.
from . import html_api, json_api, utils, xml_api
from .html_api import *
from .xml_api import *
from .json_api import *
from .utils import *


__version__ = "0.0.1"

# Gather all __all__ from submodules to define the public API
__all__ = (
    html_api.__all__
    + xml_api.__all__
    + json_api.__all__
    + utils.__all__
)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# exceptions.py
"""
Custom exceptions for the Scrapery package.

All errors raised by Scrapery derive from ScraperyError, so callers can
catch the whole family with a single `except ScraperyError`.
"""


class ScraperyError(Exception):
    """Base class for all Scrapery exceptions."""


class ParserError(ScraperyError):
    """Parsing of HTML, XML, or JSON content failed."""


class FileError(ScraperyError):
    """Reading a file from disk failed."""


class InvalidSelectorError(ScraperyError):
    """A CSS or XPath selector could not be compiled."""


class ElementNotFoundError(ScraperyError):
    """A requested element was not found in the document."""


class ValidationError(ScraperyError):
    """Input failed validation."""


class SelectorError(ScraperyError):
    """A selector could not be applied."""


class NetworkError(ScraperyError):
    """A network operation failed."""


class EncodingError(ScraperyError):
    """Character-encoding detection or decoding failed."""
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
# html_api.py
"""
HTML-specific function-based API using ScraperyHTMLElement.
"""
from typing import Optional
from .html_elements import ScraperyHTMLElement
from .exceptions import ParserError
from .utils import standardized_string

# Public API of this module. The README examples call
# get_selector_elements / has_class / get_classes via the package namespace,
# so they must be exported here for star-imports to expose them.
__all__ = [
    "parse_html",
    "prettify",
    "select_all",
    "select_one",
    "get_selector_elements",
    "get_selector_content",
    "get_metadata",  # NOTE(review): not defined in this module — confirm it exists elsewhere
    "parent",
    "children",
    "siblings",
    "next_sibling",
    "prev_sibling",
    "ancestors",
    "descendants",
    "has_class",
    "get_classes",
]
|
|
25
|
+
|
|
26
|
+
def parse_html(html_content: str | bytes, **kwargs) -> ScraperyHTMLElement:
    """
    Parse an HTML document into a ScraperyHTMLElement.

    Args:
        html_content: Raw HTML as text or bytes.
        **kwargs: Forwarded to ScraperyHTMLElement.from_html.

    Raises:
        ParserError: If the underlying parser rejects the content.
    """
    try:
        return ScraperyHTMLElement.from_html(html_content, **kwargs)
    except Exception as e:
        # Chain the original exception so the root cause stays visible
        # in tracebacks instead of being flattened into a message string.
        raise ParserError(f"Failed to parse HTML: {e}") from e
|
|
31
|
+
|
|
32
|
+
def prettify(element: ScraperyHTMLElement) -> str:
    """Return the element's HTML serialized with pretty-printed indentation."""
    return element.html(pretty=True)
|
|
34
|
+
|
|
35
|
+
def _detect_selector_method(selector: str) -> str:
|
|
36
|
+
"""
|
|
37
|
+
Detect whether the selector is XPath or CSS with robust rules.
|
|
38
|
+
"""
|
|
39
|
+
selector = selector.strip()
|
|
40
|
+
|
|
41
|
+
# Strong XPath signals
|
|
42
|
+
xpath_signals = ["//", ".//", "/", "@", "contains(", "starts-with(", "text()", "::", "[", "]"]
|
|
43
|
+
|
|
44
|
+
if any(sig in selector for sig in xpath_signals):
|
|
45
|
+
return "xpath"
|
|
46
|
+
|
|
47
|
+
# Default fallback → CSS
|
|
48
|
+
return "css"
|
|
49
|
+
|
|
50
|
+
def get_selector_elements(element: ScraperyHTMLElement, selector: str) -> list[ScraperyHTMLElement]:
    """Return every element matching *selector*, auto-detecting CSS vs XPath."""
    if _detect_selector_method(selector) == "xpath":
        return element.xpath(selector)
    return element.css(selector)
|
|
56
|
+
|
|
57
|
+
def select_all(element: ScraperyHTMLElement, selector: str) -> list[ScraperyHTMLElement]:
    """Alias for get_selector_elements(): all matches for a CSS or XPath selector."""
    return get_selector_elements(element, selector)
|
|
59
|
+
|
|
60
|
+
def select_one(element: ScraperyHTMLElement, selector: str) -> ScraperyHTMLElement | None:
    """Return the first element matching *selector*, or None if nothing matches."""
    matches = get_selector_elements(element, selector)
    return next(iter(matches), None)
|
|
63
|
+
|
|
64
|
+
def get_selector_content(
    element: Optional[ScraperyHTMLElement],
    selector: Optional[str] = None,
    attr: Optional[str] = None
) -> Optional[str]:
    """
    Extract content from a ScraperyHTMLElement using CSS or XPath auto-detection.

    Supports multiple cases:
    1. Return text of the first matching element for selector.
    2. Return value of the specified attribute for selector.
    3. Return value of the specified attribute from the element directly.
    4. Return text content of the entire element if no selector or attribute is provided.

    Returns None when the element is None, nothing matches, or extraction fails.
    """
    if element is None:
        return None

    try:
        # Cases 3 & 4: no selector — read directly from the element.
        # Fetch the raw value once instead of evaluating attr()/text() twice.
        if not selector:
            raw = element.attr(attr, default=None) if attr else element.text()
            return standardized_string(raw) if raw else None

        # Detect selector method (css or xpath) and fetch the first match.
        if _detect_selector_method(selector) == "xpath":
            result = element.xpath_one(selector)
        else:
            result = element.css_one(selector)

        if result is None:
            return None

        if attr:
            return standardized_string(result.attr(attr, default=None))
        return standardized_string(result.text())

    except Exception as e:
        # Best-effort API: report the problem and return None rather than
        # propagate, matching the documented None-on-failure contract.
        print(f"Error in get_selector_content: {e}")
        return None
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# DOM navigation functions
|
|
110
|
+
|
|
111
|
+
def parent(element: ScraperyHTMLElement) -> ScraperyHTMLElement | None:
    """Return the parent element, or None at the document root."""
    return element.parent()
|
|
113
|
+
|
|
114
|
+
def children(element: ScraperyHTMLElement) -> list[ScraperyHTMLElement]:
    """Return the element's direct child elements."""
    return element.children()
|
|
116
|
+
|
|
117
|
+
def siblings(element: ScraperyHTMLElement) -> list[ScraperyHTMLElement]:
    """Return all siblings of *element*: its parent's children, excluding itself."""
    parent_node = element.parent()
    if not parent_node:
        return []
    self_node = element._unwrap()
    return [child for child in parent_node.children() if child._unwrap() is not self_node]
|
|
122
|
+
|
|
123
|
+
def next_sibling(element: ScraperyHTMLElement) -> ScraperyHTMLElement | None:
    """Return the sibling immediately after *element*, or None if it is last (or root)."""
    parent_node = element.parent()
    if parent_node is None:
        return None
    target = element._unwrap()
    kids = parent_node.children()
    for idx, child in enumerate(kids):
        if child._unwrap() is target:
            return kids[idx + 1] if idx + 1 < len(kids) else None
    return None
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def prev_sibling(element: ScraperyHTMLElement) -> ScraperyHTMLElement | None:
    """Return the sibling immediately before *element*, or None if it is first (or root)."""
    parent_node = element.parent()
    if parent_node is None:
        return None
    target = element._unwrap()
    kids = parent_node.children()
    for idx, child in enumerate(kids):
        if child._unwrap() is target:
            return kids[idx - 1] if idx > 0 else None
    return None
|
|
145
|
+
|
|
146
|
+
def ancestors(element: ScraperyHTMLElement) -> list[ScraperyHTMLElement]:
    """Return the chain of ancestors from the nearest parent up to the root."""
    chain = []
    node = element.parent()
    while node:
        chain.append(node)
        node = node.parent()
    return chain
|
|
153
|
+
|
|
154
|
+
def descendants(element: ScraperyHTMLElement) -> list[ScraperyHTMLElement]:
    """Return every descendant of *element* in document (pre-order) sequence."""
    found: list[ScraperyHTMLElement] = []
    # Iterative pre-order traversal; children are pushed reversed so the
    # leftmost child is popped (and visited) first.
    stack = list(reversed(element.children()))
    while stack:
        node = stack.pop()
        found.append(node)
        stack.extend(reversed(node.children()))
    return found
|
|
162
|
+
|
|
163
|
+
def has_class(element: ScraperyHTMLElement, class_name: str) -> bool:
    """True if *class_name* appears in the element's whitespace-separated class attribute."""
    class_attr = element.attr("class", "")
    return class_name in class_attr.split()
|
|
165
|
+
|
|
166
|
+
def get_classes(element: ScraperyHTMLElement) -> list[str]:
    """Return the element's CSS classes as a list (empty when no class attribute)."""
    class_attr = element.attr("class", "")
    return class_attr.split()
|
|
168
|
+
|
|
169
|
+
|