scrapling 0.2.99__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +759 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +644 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +170 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +158 -175
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +239 -333
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.1.dist-info/METADATA +411 -0
  30. scrapling-0.3.1.dist-info/RECORD +41 -0
  31. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -25
  34. scrapling/engines/camo.py +0 -339
  35. scrapling/engines/pw.py +0 -465
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.99.dist-info/METADATA +0 -290
  38. scrapling-0.2.99.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -97
  43. tests/fetchers/async/test_httpx.py +0 -85
  44. tests/fetchers/async/test_playwright.py +0 -101
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -70
  47. tests/fetchers/sync/test_httpx.py +0 -84
  48. tests/fetchers/sync/test_playwright.py +0 -89
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,348 @@
1
+ """
2
+ This file is mostly copied from the submodule `w3lib.html` source code to stop downloading the whole library to use a small part of it.
3
+ So the goal of doing this is to minimize the memory footprint and keep the library size relatively smaller.
4
+ Repo source code: https://github.com/scrapy/w3lib/blob/master/w3lib/html.py
5
+ """
6
+
7
+ from re import compile as _re_compile, IGNORECASE
8
+
9
+ from scrapling.core._types import Iterable, Optional, Match, StrOrBytes
10
+
11
# Matches a single HTML character reference. For every match exactly one of the
# `named`, `dec` or `hex` groups is populated (the other two are None), and
# `semicolon` holds the optional terminating ";" (empty string when absent).
# Matching is case-insensitive, so "&AMP;" and "&#XA3;" are recognised as well.
_ent_re = _re_compile(
    r"&("
    r"(?P<named>[a-z\d]+)"  # named reference, e.g. &amp;
    r"|#(?P<dec>\d+)"  # decimal reference, e.g. &#163;
    r"|#x(?P<hex>[a-f\d]+)"  # hexadecimal reference, e.g. &#xa3;
    r")(?P<semicolon>;?)",
    flags=IGNORECASE,
)
15
# Maps each HTML 4 entity name (e.g. "pound", "AElig") to its Unicode code point.
# The standard library already ships this 252-entry table, so it is imported
# instead of being duplicated by hand; no third-party dependency is involved.
from html.entities import name2codepoint
270
+
271
+
272
def to_unicode(
    text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict"
) -> str:
    """Coerce `text` to `str`.

    :param text: A `str`, which is handed back untouched, or a `bytes` object to decode.
    :param encoding: Codec used to decode bytes; `None` means "utf-8".
    :param errors: Error-handling scheme forwarded to `bytes.decode`.
    :return: The text as a `str`.
    :raises TypeError: If `text` is neither `bytes` nor `str`.
    """
    if isinstance(text, bytes):
        codec = "utf-8" if encoding is None else encoding
        return text.decode(codec, errors)
    if isinstance(text, str):
        return text
    raise TypeError(f"to_unicode must receive bytes or str, got {type(text).__name__}")
286
+
287
+
288
def _replace_entities(
    text: StrOrBytes,
    keep: Iterable[str] = (),
    remove_illegal: bool = True,
    encoding: str = "utf-8",
) -> str:
    """Replace the HTML character references in `text` with the characters they denote.

    Both numeric references (``&#nnnn;`` and ``&#xhhhh;``) and named references
    (such as ``&nbsp;`` or ``&gt;``) are supported. The trailing semicolon is optional.

    :param text: A Unicode string, or a byte string encoded in `encoding`.
    :param keep: Names of entities to leave untouched. The matched name is lower-cased
        before the membership test, so the names given here should be lowercase.
    :param remove_illegal: Controls what happens to references that can't be converted.
        If ``True``, such a reference is removed when it ends with a semicolon and is
        kept "as is" otherwise. If ``False``, it is always kept "as is".
    :param encoding: Codec used to decode `text` when it is a byte string (defaults to 'utf-8').
    :return: Always a Unicode string, with the convertible references replaced.
    :raises TypeError: If `text` is neither ``bytes`` nor ``str`` (raised by `to_unicode`).

    >>> _replace_entities(b'Price: &pound;100')
    'Price: £100'
    >>> print(_replace_entities(b'Price: &pound;100'))
    Price: £100
    >>>

    """

    def convert_entity(m: Match[str]) -> str:
        groups = m.groupdict()
        number = None
        if groups.get("dec"):
            try:
                number = int(groups["dec"], 10)
            except ValueError:
                # Interpreters that enforce the integer string conversion length limit
                # refuse very long digit runs. Such a value could never be a valid code
                # point anyway, so treat the reference as unconvertible instead of
                # letting one hostile reference abort the whole substitution.
                number = None
        elif groups.get("hex"):
            number = int(groups["hex"], 16)
        elif groups.get("named"):
            entity_name = groups["named"]
            if entity_name.lower() in keep:
                return m.group(0)
            # Entity names are case-sensitive ("Prime" != "prime"), so try the exact
            # spelling first and only then fall back to the lower-cased name.
            number = name2codepoint.get(entity_name) or name2codepoint.get(
                entity_name.lower()
            )
        if number is not None:
            # Browsers typically
            # interpret numeric character references in the 80-9F range as representing the characters mapped
            # to bytes 80-9F in the Windows-1252 encoding. For more info
            # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
            try:
                if 0x80 <= number <= 0x9F:
                    return bytes((number,)).decode("cp1252")
                return chr(number)
            except (ValueError, OverflowError):  # pragma: no cover
                # Out-of-range code point or a byte undefined in cp1252: unconvertible.
                pass

        # Unconvertible reference: drop it only when asked to and when it was
        # properly terminated; otherwise leave the original text in place.
        return "" if remove_illegal and groups.get("semicolon") else m.group(0)

    return _ent_re.sub(convert_entity, to_unicode(text, encoding))
scrapling/core/_types.py CHANGED
@@ -2,26 +2,43 @@
2
2
  Type definitions for type checking purposes.
3
3
  """
4
4
 
5
- from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
6
- List, Literal, Optional, Pattern, Tuple, Type, TypeVar,
7
- Union)
5
+ from typing import (
6
+ TYPE_CHECKING,
7
+ cast,
8
+ overload,
9
+ Any,
10
+ Callable,
11
+ Dict,
12
+ Generator,
13
+ Iterable,
14
+ List,
15
+ Literal,
16
+ Optional,
17
+ Pattern,
18
+ Tuple,
19
+ TypeVar,
20
+ Union,
21
+ Match,
22
+ Mapping,
23
+ Awaitable,
24
+ Protocol,
25
+ SupportsIndex,
26
+ )
8
27
 
28
+ SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
9
29
  SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
30
+ PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
31
+ extraction_types = Literal["text", "html", "markdown"]
32
+ StrOrBytes = Union[str, bytes]
10
33
 
11
- try:
12
- from typing import Protocol
13
- except ImportError:
14
- # Added in Python 3.8
15
- Protocol = object
16
34
 
17
35
  try:
18
- from typing import SupportsIndex
19
- except ImportError:
20
- # 'SupportsIndex' got added in Python 3.8
21
- SupportsIndex = None
36
+ # Python 3.11+
37
+ from typing import Self # novermin
38
+ except ImportError: # pragma: no cover
39
+ try:
40
+ from typing_extensions import Self # Backport
41
+ except ImportError:
42
+ from typing import TypeVar
22
43
 
23
- if TYPE_CHECKING:
24
- # typing.Self requires Python 3.11
25
- from typing_extensions import Self
26
- else:
27
- Self = object
44
+ Self = object