wxpath 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wxpath-0.1.0/src/wxpath.egg-info → wxpath-0.1.1}/PKG-INFO +3 -1
- {wxpath-0.1.0 → wxpath-0.1.1}/pyproject.toml +19 -1
- {wxpath-0.1.0 → wxpath-0.1.1}/src/wxpath/cli.py +8 -3
- {wxpath-0.1.0 → wxpath-0.1.1}/src/wxpath/patches.py +7 -2
- {wxpath-0.1.0 → wxpath-0.1.1/src/wxpath.egg-info}/PKG-INFO +3 -1
- {wxpath-0.1.0 → wxpath-0.1.1}/src/wxpath.egg-info/requires.txt +3 -0
- {wxpath-0.1.0 → wxpath-0.1.1}/LICENSE +0 -0
- {wxpath-0.1.0 → wxpath-0.1.1}/README.md +0 -0
- {wxpath-0.1.0 → wxpath-0.1.1}/setup.cfg +0 -0
- {wxpath-0.1.0 → wxpath-0.1.1}/src/wxpath/__init__.py +0 -0
- {wxpath-0.1.0 → wxpath-0.1.1}/src/wxpath.egg-info/SOURCES.txt +0 -0
- {wxpath-0.1.0 → wxpath-0.1.1}/src/wxpath.egg-info/dependency_links.txt +0 -0
- {wxpath-0.1.0 → wxpath-0.1.1}/src/wxpath.egg-info/entry_points.txt +0 -0
- {wxpath-0.1.0 → wxpath-0.1.1}/src/wxpath.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: wxpath
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: wxpath - a declarative web crawler and data extractor
|
|
5
5
|
Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -14,6 +14,8 @@ Requires-Dist: aiohttp>=3.8.0
|
|
|
14
14
|
Provides-Extra: test
|
|
15
15
|
Requires-Dist: pytest>=7.0; extra == "test"
|
|
16
16
|
Requires-Dist: pytest-asyncio>=0.23; extra == "test"
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: ruff; extra == "dev"
|
|
17
19
|
Dynamic: license-file
|
|
18
20
|
|
|
19
21
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "wxpath"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.1.1"
|
|
8
8
|
description = "wxpath - a declarative web crawler and data extractor"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -22,6 +22,7 @@ dependencies = [
|
|
|
22
22
|
|
|
23
23
|
[project.optional-dependencies]
|
|
24
24
|
test = ["pytest>=7.0", "pytest-asyncio>=0.23"]
|
|
25
|
+
dev = ["ruff"]
|
|
25
26
|
|
|
26
27
|
[project.scripts]
|
|
27
28
|
wxpath = "wxpath.cli:main"
|
|
@@ -36,3 +37,20 @@ package-dir = {"" = "src"}
|
|
|
36
37
|
|
|
37
38
|
[tool.setuptools.packages.find]
|
|
38
39
|
include = ["wxpath"]
|
|
40
|
+
|
|
41
|
+
[tool.ruff]
|
|
42
|
+
target-version = "py311"
|
|
43
|
+
line-length = 100
|
|
44
|
+
|
|
45
|
+
lint.select = [
|
|
46
|
+
"F", # pyflakes (unused vars, undefined names, etc.)
|
|
47
|
+
"E", # pycodestyle errors
|
|
48
|
+
"B", # flake8-bugbear (real footguns)
|
|
49
|
+
"ASYNC", # async/await correctness
|
|
50
|
+
"I", # isort rules
|
|
51
|
+
"TID", # Tidy imports
|
|
52
|
+
"ICN", # Import conventions
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
[tool.ruff.format]
|
|
56
|
+
quote-style = "single"
|
|
@@ -2,10 +2,10 @@ import argparse
|
|
|
2
2
|
import json
|
|
3
3
|
import sys
|
|
4
4
|
|
|
5
|
-
from wxpath.hooks import builtin # load default hooks
|
|
6
5
|
from wxpath.core.ops import WxStr
|
|
7
6
|
from wxpath.core.parser import parse_wxpath_expr
|
|
8
|
-
from wxpath.core.runtime.engine import
|
|
7
|
+
from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
|
|
8
|
+
from wxpath.hooks import builtin # noqa: F401
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def _simplify(obj):
|
|
@@ -39,7 +39,12 @@ def main():
|
|
|
39
39
|
parser.add_argument("--verbose", action="store_true", help="Verbose mode")
|
|
40
40
|
|
|
41
41
|
parser.add_argument("--concurrency", type=int, default=16, help="Number of concurrent fetches")
|
|
42
|
-
parser.add_argument(
|
|
42
|
+
parser.add_argument(
|
|
43
|
+
"--concurrency-per-host",
|
|
44
|
+
type=int,
|
|
45
|
+
default=8,
|
|
46
|
+
help="Number of concurrent fetches per host"
|
|
47
|
+
)
|
|
43
48
|
|
|
44
49
|
args = parser.parse_args()
|
|
45
50
|
|
|
@@ -4,7 +4,9 @@ from lxml import etree, html
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def html_element_repr(self):
|
|
7
|
-
return f"HtmlElement(tag={self.tag},
|
|
7
|
+
return (f"HtmlElement(tag={self.tag}, "
|
|
8
|
+
f"depth={self.get('depth', -1)}, "
|
|
9
|
+
f"base_url={getattr(self, 'base_url', None)!r})")
|
|
8
10
|
|
|
9
11
|
# Patch lxml.html.HtmlElement.__repr__ to improve debugging with base_url.
|
|
10
12
|
html.HtmlElement.__repr__ = html_element_repr
|
|
@@ -17,7 +19,10 @@ class XPath3Element(etree.ElementBase):
|
|
|
17
19
|
returning the results as a list.
|
|
18
20
|
"""
|
|
19
21
|
kwargs.setdefault("parser", XPath3Parser)
|
|
20
|
-
kwargs.setdefault(
|
|
22
|
+
kwargs.setdefault(
|
|
23
|
+
"uri",
|
|
24
|
+
getattr(self.getroottree().docinfo, "URL", None) or self.get("base_url")
|
|
25
|
+
)
|
|
21
26
|
return elementpath.select(self, expr, **kwargs)
|
|
22
27
|
|
|
23
28
|
# --- Convenience property for backward‑compatibility -----------------
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: wxpath
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: wxpath - a declarative web crawler and data extractor
|
|
5
5
|
Author-email: Rodrigo Palacios <rodrigopala91@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -14,6 +14,8 @@ Requires-Dist: aiohttp>=3.8.0
|
|
|
14
14
|
Provides-Extra: test
|
|
15
15
|
Requires-Dist: pytest>=7.0; extra == "test"
|
|
16
16
|
Requires-Dist: pytest-asyncio>=0.23; extra == "test"
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: ruff; extra == "dev"
|
|
17
19
|
Dynamic: license-file
|
|
18
20
|
|
|
19
21
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|