selectolax 0.3.33__cp39-cp39-win_amd64.whl → 0.3.34__cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- selectolax/__init__.py +1 -1
- selectolax/lexbor/node.pxi +6 -0
- selectolax/lexbor.c +1942 -1928
- selectolax/lexbor.cp39-win_amd64.pyd +0 -0
- selectolax/lexbor.pyi +12 -0
- selectolax/lexbor.pyx +5 -0
- selectolax/parser.c +54 -40
- selectolax/parser.cp39-win_amd64.pyd +0 -0
- selectolax-0.3.34.dist-info/METADATA +32 -0
- {selectolax-0.3.33.dist-info → selectolax-0.3.34.dist-info}/RECORD +13 -13
- selectolax-0.3.33.dist-info/METADATA +0 -187
- {selectolax-0.3.33.dist-info → selectolax-0.3.34.dist-info}/WHEEL +0 -0
- {selectolax-0.3.33.dist-info → selectolax-0.3.34.dist-info}/licenses/LICENSE +0 -0
- {selectolax-0.3.33.dist-info → selectolax-0.3.34.dist-info}/top_level.txt +0 -0
|
Binary file
|
selectolax/lexbor.pyi
CHANGED
|
@@ -145,6 +145,12 @@ class LexborNode:
|
|
|
145
145
|
Matches pattern `query` against HTML tree.
|
|
146
146
|
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
147
147
|
|
|
148
|
+
Special selectors:
|
|
149
|
+
|
|
150
|
+
- parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
|
|
151
|
+
- parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
|
|
152
|
+
|
|
153
|
+
|
|
148
154
|
Parameters
|
|
149
155
|
----------
|
|
150
156
|
query : str
|
|
@@ -665,6 +671,12 @@ class LexborHTMLParser:
|
|
|
665
671
|
Matches pattern `query` against HTML tree.
|
|
666
672
|
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
667
673
|
|
|
674
|
+
Special selectors:
|
|
675
|
+
|
|
676
|
+
- parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
|
|
677
|
+
- parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
|
|
678
|
+
|
|
679
|
+
|
|
668
680
|
Parameters
|
|
669
681
|
----------
|
|
670
682
|
query : str
|
selectolax/lexbor.pyx
CHANGED
|
@@ -169,6 +169,11 @@ cdef class LexborHTMLParser:
|
|
|
169
169
|
Matches pattern `query` against HTML tree.
|
|
170
170
|
`CSS selectors reference <https://www.w3schools.com/cssref/css_selectors.asp>`_.
|
|
171
171
|
|
|
172
|
+
Special selectors:
|
|
173
|
+
|
|
174
|
+
- parser.css('p:lexbor-contains("awesome" i)') -- case-insensitive contains
|
|
175
|
+
- parser.css('p:lexbor-contains("awesome")') -- case-sensitive contains
|
|
176
|
+
|
|
172
177
|
Parameters
|
|
173
178
|
----------
|
|
174
179
|
query : str
|
selectolax/parser.c
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
/* Generated by Cython 3.1.
|
|
1
|
+
/* Generated by Cython 3.1.3 */
|
|
2
2
|
|
|
3
3
|
/* BEGIN: Cython Metadata
|
|
4
4
|
{
|
|
@@ -182,8 +182,8 @@ END: Cython Metadata */
|
|
|
182
182
|
#elif PY_VERSION_HEX < 0x03080000
|
|
183
183
|
#error Cython requires Python 3.8+.
|
|
184
184
|
#else
|
|
185
|
-
#define __PYX_ABI_VERSION "
|
|
186
|
-
#define CYTHON_HEX_VERSION
|
|
185
|
+
#define __PYX_ABI_VERSION "3_1_3"
|
|
186
|
+
#define CYTHON_HEX_VERSION 0x030103F0
|
|
187
187
|
#define CYTHON_FUTURE_DIVISION 1
|
|
188
188
|
/* CModulePreamble */
|
|
189
189
|
#include <stddef.h>
|
|
@@ -546,6 +546,9 @@ END: Cython Metadata */
|
|
|
546
546
|
enum { __pyx_check_sizeof_voidp = 1 / (int)(SIZEOF_VOID_P == sizeof(void*)) };
|
|
547
547
|
#endif
|
|
548
548
|
#endif
|
|
549
|
+
#ifndef CYTHON_LOCK_AND_GIL_DEADLOCK_AVOIDANCE_TIME
|
|
550
|
+
#define CYTHON_LOCK_AND_GIL_DEADLOCK_AVOIDANCE_TIME 100
|
|
551
|
+
#endif
|
|
549
552
|
#ifndef __has_attribute
|
|
550
553
|
#define __has_attribute(x) 0
|
|
551
554
|
#endif
|
|
@@ -2768,22 +2771,22 @@ static int __Pyx__DelItemOnTypeDict(PyTypeObject *tp, PyObject *k);
|
|
|
2768
2771
|
static int __Pyx_setup_reduce(PyObject* type_obj);
|
|
2769
2772
|
|
|
2770
2773
|
/* TypeImport.proto */
|
|
2771
|
-
#ifndef
|
|
2772
|
-
#define
|
|
2774
|
+
#ifndef __PYX_HAVE_RT_ImportType_proto_3_1_3
|
|
2775
|
+
#define __PYX_HAVE_RT_ImportType_proto_3_1_3
|
|
2773
2776
|
#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
|
|
2774
2777
|
#include <stdalign.h>
|
|
2775
2778
|
#endif
|
|
2776
2779
|
#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || __cplusplus >= 201103L
|
|
2777
|
-
#define
|
|
2780
|
+
#define __PYX_GET_STRUCT_ALIGNMENT_3_1_3(s) alignof(s)
|
|
2778
2781
|
#else
|
|
2779
|
-
#define
|
|
2782
|
+
#define __PYX_GET_STRUCT_ALIGNMENT_3_1_3(s) sizeof(void*)
|
|
2780
2783
|
#endif
|
|
2781
|
-
enum
|
|
2782
|
-
|
|
2783
|
-
|
|
2784
|
-
|
|
2784
|
+
enum __Pyx_ImportType_CheckSize_3_1_3 {
|
|
2785
|
+
__Pyx_ImportType_CheckSize_Error_3_1_3 = 0,
|
|
2786
|
+
__Pyx_ImportType_CheckSize_Warn_3_1_3 = 1,
|
|
2787
|
+
__Pyx_ImportType_CheckSize_Ignore_3_1_3 = 2
|
|
2785
2788
|
};
|
|
2786
|
-
static PyTypeObject *
|
|
2789
|
+
static PyTypeObject *__Pyx_ImportType_3_1_3(PyObject* module, const char *module_name, const char *class_name, size_t size, size_t alignment, enum __Pyx_ImportType_CheckSize_3_1_3 check_size);
|
|
2787
2790
|
#endif
|
|
2788
2791
|
|
|
2789
2792
|
/* FetchSharedCythonModule.proto */
|
|
@@ -3567,7 +3570,7 @@ static const char __pyx_k_Node_strip_tags_line_554[] = "Node.strip_tags (line 55
|
|
|
3567
3570
|
static const char __pyx_k_Selector___reduce_cython[] = "Selector.__reduce_cython__";
|
|
3568
3571
|
static const char __pyx_k_Tag_name_cannot_be_empty[] = "Tag name cannot be empty";
|
|
3569
3572
|
static const char __pyx_k_document_no_head_no_body[] = "document_no_head_no_body";
|
|
3570
|
-
static const char __pyx_k_hk_A_1_vvxxy_881A_7_nA_1[] = "\200\001\360\006\000\005\010\200
|
|
3573
|
+
static const char __pyx_k_hk_A_1_vvxxy_881A_7_nA_1[] = "\200\001\360\006\000\005\010\200\177\220h\230k\250\033\260A\330\010\r\210^\2301\330\010\016\320\016!\320!v\320vx\320xy\330\004\023\2208\2308\2401\240A\330\004\007\200|\2207\230!\330\010*\250!\250;\260n\300A\330\004\013\2101";
|
|
3571
3574
|
static const char __pyx_k_Node_unwrap_tags_line_580[] = "Node.unwrap_tags (line 580)";
|
|
3572
3575
|
static const char __pyx_k_Node_unwrap_tags_line_768[] = "Node.unwrap_tags (line 768)";
|
|
3573
3576
|
static const char __pyx_k_any_attribute_longer_than[] = "any_attribute_longer_than";
|
|
@@ -38627,27 +38630,27 @@ static int __Pyx_modinit_type_import_code(__pyx_mstatetype *__pyx_mstate) {
|
|
|
38627
38630
|
/*--- Type import code ---*/
|
|
38628
38631
|
__pyx_t_1 = PyImport_ImportModule(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_t_1)) __PYX_ERR(8, 8, __pyx_L1_error)
|
|
38629
38632
|
__Pyx_GOTREF(__pyx_t_1);
|
|
38630
|
-
__pyx_mstate->__pyx_ptype_7cpython_4bool_bool =
|
|
38633
|
+
__pyx_mstate->__pyx_ptype_7cpython_4bool_bool = __Pyx_ImportType_3_1_3(__pyx_t_1, __Pyx_BUILTIN_MODULE_NAME, "bool",
|
|
38631
38634
|
#if defined(PYPY_VERSION_NUM) && PYPY_VERSION_NUM < 0x050B0000
|
|
38632
|
-
sizeof(PyLongObject),
|
|
38635
|
+
sizeof(PyLongObject), __PYX_GET_STRUCT_ALIGNMENT_3_1_3(PyLongObject),
|
|
38633
38636
|
#elif CYTHON_COMPILING_IN_LIMITED_API
|
|
38634
38637
|
0, 0,
|
|
38635
38638
|
#else
|
|
38636
|
-
sizeof(PyLongObject),
|
|
38639
|
+
sizeof(PyLongObject), __PYX_GET_STRUCT_ALIGNMENT_3_1_3(PyLongObject),
|
|
38637
38640
|
#endif
|
|
38638
|
-
|
|
38641
|
+
__Pyx_ImportType_CheckSize_Warn_3_1_3); if (!__pyx_mstate->__pyx_ptype_7cpython_4bool_bool) __PYX_ERR(8, 8, __pyx_L1_error)
|
|
38639
38642
|
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
|
|
38640
38643
|
__pyx_t_1 = PyImport_ImportModule(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_t_1)) __PYX_ERR(9, 9, __pyx_L1_error)
|
|
38641
38644
|
__Pyx_GOTREF(__pyx_t_1);
|
|
38642
|
-
__pyx_mstate->__pyx_ptype_7cpython_4type_type =
|
|
38645
|
+
__pyx_mstate->__pyx_ptype_7cpython_4type_type = __Pyx_ImportType_3_1_3(__pyx_t_1, __Pyx_BUILTIN_MODULE_NAME, "type",
|
|
38643
38646
|
#if defined(PYPY_VERSION_NUM) && PYPY_VERSION_NUM < 0x050B0000
|
|
38644
|
-
sizeof(PyTypeObject),
|
|
38647
|
+
sizeof(PyTypeObject), __PYX_GET_STRUCT_ALIGNMENT_3_1_3(PyTypeObject),
|
|
38645
38648
|
#elif CYTHON_COMPILING_IN_LIMITED_API
|
|
38646
38649
|
0, 0,
|
|
38647
38650
|
#else
|
|
38648
|
-
sizeof(PyHeapTypeObject),
|
|
38651
|
+
sizeof(PyHeapTypeObject), __PYX_GET_STRUCT_ALIGNMENT_3_1_3(PyHeapTypeObject),
|
|
38649
38652
|
#endif
|
|
38650
|
-
|
|
38653
|
+
__Pyx_ImportType_CheckSize_Warn_3_1_3); if (!__pyx_mstate->__pyx_ptype_7cpython_4type_type) __PYX_ERR(9, 9, __pyx_L1_error)
|
|
38651
38654
|
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
|
|
38652
38655
|
__Pyx_RefNannyFinishContext();
|
|
38653
38656
|
return 0;
|
|
@@ -40752,16 +40755,15 @@ static int __Pyx_InitConstants(__pyx_mstatetype *__pyx_mstate) {
|
|
|
40752
40755
|
return -1;
|
|
40753
40756
|
}
|
|
40754
40757
|
/* #### Code section: init_codeobjects ### */
|
|
40755
|
-
|
|
40756
|
-
|
|
40757
|
-
|
|
40758
|
-
|
|
40759
|
-
|
|
40760
|
-
|
|
40761
|
-
|
|
40762
|
-
|
|
40763
|
-
|
|
40764
|
-
} __Pyx_PyCode_New_function_description;
|
|
40758
|
+
typedef struct {
|
|
40759
|
+
unsigned int argcount : 3;
|
|
40760
|
+
unsigned int num_posonly_args : 1;
|
|
40761
|
+
unsigned int num_kwonly_args : 1;
|
|
40762
|
+
unsigned int nlocals : 4;
|
|
40763
|
+
unsigned int flags : 10;
|
|
40764
|
+
unsigned int first_line : 10;
|
|
40765
|
+
unsigned int line_table_length : 13;
|
|
40766
|
+
} __Pyx_PyCode_New_function_description;
|
|
40765
40767
|
/* NewCodeObj.proto */
|
|
40766
40768
|
static PyObject* __Pyx_PyCode_New(
|
|
40767
40769
|
const __Pyx_PyCode_New_function_description descr,
|
|
@@ -43842,6 +43844,13 @@ try_unpack:
|
|
|
43842
43844
|
|
|
43843
43845
|
/* PyObjectCallMethod0 */
|
|
43844
43846
|
static PyObject* __Pyx_PyObject_CallMethod0(PyObject* obj, PyObject* method_name) {
|
|
43847
|
+
#if CYTHON_VECTORCALL && (__PYX_LIMITED_VERSION_HEX >= 0x030C0000 || (!CYTHON_COMPILING_IN_LIMITED_API && PY_VERSION_HEX >= 0x03090000))
|
|
43848
|
+
PyObject *args[1] = {obj};
|
|
43849
|
+
(void) __Pyx_PyObject_GetMethod;
|
|
43850
|
+
(void) __Pyx_PyObject_CallOneArg;
|
|
43851
|
+
(void) __Pyx_PyObject_CallNoArg;
|
|
43852
|
+
return PyObject_VectorcallMethod(method_name, args, 1 | PY_VECTORCALL_ARGUMENTS_OFFSET, NULL);
|
|
43853
|
+
#else
|
|
43845
43854
|
PyObject *method = NULL, *result = NULL;
|
|
43846
43855
|
int is_method = __Pyx_PyObject_GetMethod(obj, method_name, &method);
|
|
43847
43856
|
if (likely(is_method)) {
|
|
@@ -43854,6 +43863,7 @@ static PyObject* __Pyx_PyObject_CallMethod0(PyObject* obj, PyObject* method_name
|
|
|
43854
43863
|
Py_DECREF(method);
|
|
43855
43864
|
bad:
|
|
43856
43865
|
return result;
|
|
43866
|
+
#endif
|
|
43857
43867
|
}
|
|
43858
43868
|
|
|
43859
43869
|
/* RaiseNeedMoreValuesToUnpack */
|
|
@@ -44664,6 +44674,7 @@ static int __Pyx_fix_up_extension_type_from_spec(PyType_Spec *spec, PyTypeObject
|
|
|
44664
44674
|
changed = 1;
|
|
44665
44675
|
}
|
|
44666
44676
|
#endif // CYTHON_METH_FASTCALL
|
|
44677
|
+
#if !CYTHON_COMPILING_IN_PYPY
|
|
44667
44678
|
else if (strcmp(memb->name, "__module__") == 0) {
|
|
44668
44679
|
PyObject *descr;
|
|
44669
44680
|
assert(memb->type == T_OBJECT);
|
|
@@ -44678,11 +44689,13 @@ static int __Pyx_fix_up_extension_type_from_spec(PyType_Spec *spec, PyTypeObject
|
|
|
44678
44689
|
}
|
|
44679
44690
|
changed = 1;
|
|
44680
44691
|
}
|
|
44692
|
+
#endif // !CYTHON_COMPILING_IN_PYPY
|
|
44681
44693
|
}
|
|
44682
44694
|
memb++;
|
|
44683
44695
|
}
|
|
44684
44696
|
}
|
|
44685
44697
|
#endif // !CYTHON_COMPILING_IN_LIMITED_API
|
|
44698
|
+
#if !CYTHON_COMPILING_IN_PYPY
|
|
44686
44699
|
slot = spec->slots;
|
|
44687
44700
|
while (slot && slot->slot && slot->slot != Py_tp_getset)
|
|
44688
44701
|
slot++;
|
|
@@ -44714,6 +44727,7 @@ static int __Pyx_fix_up_extension_type_from_spec(PyType_Spec *spec, PyTypeObject
|
|
|
44714
44727
|
++getset;
|
|
44715
44728
|
}
|
|
44716
44729
|
}
|
|
44730
|
+
#endif // !CYTHON_COMPILING_IN_PYPY
|
|
44717
44731
|
if (changed)
|
|
44718
44732
|
PyType_Modified(type);
|
|
44719
44733
|
#endif // PY_VERSION_HEX > 0x030900B1
|
|
@@ -45147,10 +45161,10 @@ __PYX_GOOD:
|
|
|
45147
45161
|
}
|
|
45148
45162
|
|
|
45149
45163
|
/* TypeImport */
|
|
45150
|
-
#ifndef
|
|
45151
|
-
#define
|
|
45152
|
-
static PyTypeObject *
|
|
45153
|
-
size_t size, size_t alignment, enum
|
|
45164
|
+
#ifndef __PYX_HAVE_RT_ImportType_3_1_3
|
|
45165
|
+
#define __PYX_HAVE_RT_ImportType_3_1_3
|
|
45166
|
+
static PyTypeObject *__Pyx_ImportType_3_1_3(PyObject *module, const char *module_name, const char *class_name,
|
|
45167
|
+
size_t size, size_t alignment, enum __Pyx_ImportType_CheckSize_3_1_3 check_size)
|
|
45154
45168
|
{
|
|
45155
45169
|
PyObject *result = 0;
|
|
45156
45170
|
Py_ssize_t basicsize;
|
|
@@ -45206,7 +45220,7 @@ static PyTypeObject *__Pyx_ImportType_3_1_2(PyObject *module, const char *module
|
|
|
45206
45220
|
module_name, class_name, size, basicsize+itemsize);
|
|
45207
45221
|
goto bad;
|
|
45208
45222
|
}
|
|
45209
|
-
if (check_size ==
|
|
45223
|
+
if (check_size == __Pyx_ImportType_CheckSize_Error_3_1_3 &&
|
|
45210
45224
|
((size_t)basicsize > size || (size_t)(basicsize + itemsize) < size)) {
|
|
45211
45225
|
PyErr_Format(PyExc_ValueError,
|
|
45212
45226
|
"%.200s.%.200s size changed, may indicate binary incompatibility. "
|
|
@@ -45214,7 +45228,7 @@ static PyTypeObject *__Pyx_ImportType_3_1_2(PyObject *module, const char *module
|
|
|
45214
45228
|
module_name, class_name, size, basicsize, basicsize+itemsize);
|
|
45215
45229
|
goto bad;
|
|
45216
45230
|
}
|
|
45217
|
-
else if (check_size ==
|
|
45231
|
+
else if (check_size == __Pyx_ImportType_CheckSize_Warn_3_1_3 && (size_t)basicsize > size) {
|
|
45218
45232
|
if (PyErr_WarnFormat(NULL, 0,
|
|
45219
45233
|
"%.200s.%.200s size changed, may indicate binary incompatibility. "
|
|
45220
45234
|
"Expected %zd from C header, got %zd from PyObject",
|
|
@@ -45355,7 +45369,7 @@ bad:
|
|
|
45355
45369
|
}
|
|
45356
45370
|
|
|
45357
45371
|
/* CommonTypesMetaclass */
|
|
45358
|
-
PyObject* __pyx_CommonTypesMetaclass_get_module(CYTHON_UNUSED PyObject *self, CYTHON_UNUSED void* context) {
|
|
45372
|
+
static PyObject* __pyx_CommonTypesMetaclass_get_module(CYTHON_UNUSED PyObject *self, CYTHON_UNUSED void* context) {
|
|
45359
45373
|
return PyUnicode_FromString(__PYX_ABI_MODULE_NAME);
|
|
45360
45374
|
}
|
|
45361
45375
|
static PyGetSetDef __pyx_CommonTypesMetaclass_getset[] = {
|
|
@@ -48284,7 +48298,7 @@ static CYTHON_INLINE PyObject *__Pyx_PyIter_Next_Plain(PyObject *iterator) {
|
|
|
48284
48298
|
}
|
|
48285
48299
|
|
|
48286
48300
|
/* PyObjectCallMethod1 */
|
|
48287
|
-
#if !(CYTHON_VECTORCALL && __PYX_LIMITED_VERSION_HEX >= 0x030C0000)
|
|
48301
|
+
#if !(CYTHON_VECTORCALL && (__PYX_LIMITED_VERSION_HEX >= 0x030C0000 || (!CYTHON_COMPILING_IN_LIMITED_API && PY_VERSION_HEX >= 0x03090000)))
|
|
48288
48302
|
static PyObject* __Pyx__PyObject_CallMethod1(PyObject* method, PyObject* arg) {
|
|
48289
48303
|
PyObject *result = __Pyx_PyObject_CallOneArg(method, arg);
|
|
48290
48304
|
Py_DECREF(method);
|
|
@@ -48292,7 +48306,7 @@ static PyObject* __Pyx__PyObject_CallMethod1(PyObject* method, PyObject* arg) {
|
|
|
48292
48306
|
}
|
|
48293
48307
|
#endif
|
|
48294
48308
|
static PyObject* __Pyx_PyObject_CallMethod1(PyObject* obj, PyObject* method_name, PyObject* arg) {
|
|
48295
|
-
#if CYTHON_VECTORCALL && __PYX_LIMITED_VERSION_HEX >= 0x030C0000
|
|
48309
|
+
#if CYTHON_VECTORCALL && (__PYX_LIMITED_VERSION_HEX >= 0x030C0000 || (!CYTHON_COMPILING_IN_LIMITED_API && PY_VERSION_HEX >= 0x03090000))
|
|
48296
48310
|
PyObject *args[2] = {obj, arg};
|
|
48297
48311
|
(void) __Pyx_PyObject_GetMethod;
|
|
48298
48312
|
(void) __Pyx_PyObject_CallOneArg;
|
|
Binary file
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: selectolax
|
|
3
|
+
Version: 0.3.34
|
|
4
|
+
Summary: Fast HTML5 parser with CSS selectors.
|
|
5
|
+
Home-page: https://github.com/rushter/selectolax
|
|
6
|
+
Author: Artem Golubin
|
|
7
|
+
Author-email: Artem Golubin <me@rushter.com>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Project-URL: Repository, https://github.com/rushter/selectolax
|
|
10
|
+
Project-URL: Documentation, https://selectolax.readthedocs.io/en/latest/parser.html
|
|
11
|
+
Project-URL: Changelog, https://github.com/rushter/selectolax/blob/main/CHANGES.md
|
|
12
|
+
Keywords: selectolax,html,parser,css,fast
|
|
13
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
14
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
15
|
+
Classifier: Topic :: Internet
|
|
16
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
17
|
+
Classifier: Intended Audience :: Developers
|
|
18
|
+
Classifier: Natural Language :: English
|
|
19
|
+
Classifier: Programming Language :: Python :: 3
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
25
|
+
Requires-Python: >=3.9
|
|
26
|
+
Description-Content-Type: text/x-rst
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Provides-Extra: cython
|
|
29
|
+
Requires-Dist: Cython; extra == "cython"
|
|
30
|
+
Dynamic: author
|
|
31
|
+
Dynamic: home-page
|
|
32
|
+
Dynamic: license-file
|
|
@@ -1,26 +1,26 @@
|
|
|
1
|
-
selectolax/__init__.py,sha256=
|
|
1
|
+
selectolax/__init__.py,sha256=iI6pQ10gimevS2gTf4K4_1cXh4NBRFj_5GjkmhrvU94,157
|
|
2
2
|
selectolax/base.pxi,sha256=zOj3BrCA71xd-mJFtkMIAglP4ZybfrHVoCoy6ljTBDQ,93
|
|
3
|
-
selectolax/lexbor.c,sha256=
|
|
4
|
-
selectolax/lexbor.cp39-win_amd64.pyd,sha256=
|
|
3
|
+
selectolax/lexbor.c,sha256=Kz7IFiUGbVTJvAH3WTwu188zD4xQm08Fs6ab6Jo6jyE,2419433
|
|
4
|
+
selectolax/lexbor.cp39-win_amd64.pyd,sha256=oyBtlI6N_kbB3PWwszUu2wMnj5QGzzkHfRszq7w2KLE,3149824
|
|
5
5
|
selectolax/lexbor.pxd,sha256=BcqAzhlUVq0GVWiJHWXNhs4jY-gi6k0BELEnQtSYJAI,21720
|
|
6
|
-
selectolax/lexbor.pyi,sha256=
|
|
7
|
-
selectolax/lexbor.pyx,sha256=
|
|
8
|
-
selectolax/parser.c,sha256=
|
|
9
|
-
selectolax/parser.cp39-win_amd64.pyd,sha256=
|
|
6
|
+
selectolax/lexbor.pyi,sha256=dRNzLXJEbFRR7QcItuX8Ews9E9I6h6G4vA3X1hijzj4,28990
|
|
7
|
+
selectolax/lexbor.pyx,sha256=XLZ2vGwLoWdctnmU-gfizjD6tMjehR_bzNOapDJ_YOQ,12891
|
|
8
|
+
selectolax/parser.c,sha256=zUJAqFbI1vy5-cjgPwJVfYassgbP7Gdnr2eRYv5D3W4,2259231
|
|
9
|
+
selectolax/parser.cp39-win_amd64.pyd,sha256=ewom3_ckmFHulhsszlIczbpzbw6m4lBNRnitepuNkJE,2105856
|
|
10
10
|
selectolax/parser.pxd,sha256=T7GoQdaOkhp_W2TBlRY0tZqom97PkHrytYaXQlyVnbI,25196
|
|
11
11
|
selectolax/parser.pyi,sha256=-qutpjrK1dD4rrl3SsHWQt2FT5lv6meaACkQzk1Bt6o,25612
|
|
12
12
|
selectolax/parser.pyx,sha256=nIWuhaEFRwlfo64WmgrSOM0A8mUw0eWw9j_fWyLV-Ro,14127
|
|
13
13
|
selectolax/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
14
|
selectolax/utils.pxi,sha256=hkzKfycdpwH1P-E_pP-9NTGsmiajt6EJNZSlkxlRapA,3623
|
|
15
15
|
selectolax/lexbor/attrs.pxi,sha256=d59V77aGkpp7YsYsd6t_z4-tRnUoQTJZKsvMC8nyttM,3978
|
|
16
|
-
selectolax/lexbor/node.pxi,sha256=
|
|
16
|
+
selectolax/lexbor/node.pxi,sha256=KODqPk3yZ_owwdSxqNr2Ih6qAOhu9CJ-jrHtqQJcWmY,33407
|
|
17
17
|
selectolax/lexbor/selection.pxi,sha256=BeUDypw5_P0CTmi-ACLcd7pK2NnG9ASrwWOdLdweAZY,7378
|
|
18
18
|
selectolax/lexbor/util.pxi,sha256=q2EYVNdnROg9y30mWpGwlNA0W00nJ7ZRNEEDrOEG14s,584
|
|
19
19
|
selectolax/modest/node.pxi,sha256=iX_yRPIPVkG0ALW7hEfmXiVperw6RjkSGATkxzLokz0,34691
|
|
20
20
|
selectolax/modest/selection.pxi,sha256=PfHUN1uuNA7YfcxTu7JZjhxevVbFRP1bHd3kyyFdO7E,6703
|
|
21
21
|
selectolax/modest/util.pxi,sha256=zab67Wzo8FcipA2VS8ClptaC19lZirbNqFEGQ3hW2Is,572
|
|
22
|
-
selectolax-0.3.
|
|
23
|
-
selectolax-0.3.
|
|
24
|
-
selectolax-0.3.
|
|
25
|
-
selectolax-0.3.
|
|
26
|
-
selectolax-0.3.
|
|
22
|
+
selectolax-0.3.34.dist-info/licenses/LICENSE,sha256=A7Jb3WZcENcLfZRc7QPdm9zJdwfpIyPodPJu-kdMH6E,1087
|
|
23
|
+
selectolax-0.3.34.dist-info/METADATA,sha256=rAqskRB9wMSn7tEZLxZswUJD4wFzN4fizyXjiBv4L4o,1318
|
|
24
|
+
selectolax-0.3.34.dist-info/WHEEL,sha256=XkFE14KmFh7mutkkb-qn_ueuH2lwfT8rLdfc5xpQ7wE,99
|
|
25
|
+
selectolax-0.3.34.dist-info/top_level.txt,sha256=e5MuEM2PrQzoDlWetkFli9uXSlxa_ktW5jJEihhaI1c,11
|
|
26
|
+
selectolax-0.3.34.dist-info/RECORD,,
|
|
@@ -1,187 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: selectolax
|
|
3
|
-
Version: 0.3.33
|
|
4
|
-
Summary: Fast HTML5 parser with CSS selectors.
|
|
5
|
-
Home-page: https://github.com/rushter/selectolax
|
|
6
|
-
Author: Artem Golubin
|
|
7
|
-
Author-email: Artem Golubin <me@rushter.com>
|
|
8
|
-
License: MIT
|
|
9
|
-
Project-URL: Repository, https://github.com/rushter/selectolax
|
|
10
|
-
Project-URL: Documentation, https://selectolax.readthedocs.io/en/latest/parser.html
|
|
11
|
-
Project-URL: Changelog, https://github.com/rushter/selectolax/blob/main/CHANGES.rst
|
|
12
|
-
Keywords: selectolax,html,parser,css,fast
|
|
13
|
-
Classifier: Development Status :: 5 - Production/Stable
|
|
14
|
-
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
15
|
-
Classifier: Topic :: Internet
|
|
16
|
-
Classifier: Topic :: Internet :: WWW/HTTP
|
|
17
|
-
Classifier: Intended Audience :: Developers
|
|
18
|
-
Classifier: Natural Language :: English
|
|
19
|
-
Classifier: Programming Language :: Python :: 3
|
|
20
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
21
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
24
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
25
|
-
Requires-Python: >=3.9
|
|
26
|
-
Description-Content-Type: text/x-rst
|
|
27
|
-
License-File: LICENSE
|
|
28
|
-
Provides-Extra: cython
|
|
29
|
-
Requires-Dist: Cython; extra == "cython"
|
|
30
|
-
Dynamic: author
|
|
31
|
-
Dynamic: home-page
|
|
32
|
-
Dynamic: license-file
|
|
33
|
-
|
|
34
|
-
.. image:: docs/logo.png
|
|
35
|
-
:alt: selectolax logo
|
|
36
|
-
|
|
37
|
-
-------------------------
|
|
38
|
-
|
|
39
|
-
.. image:: https://img.shields.io/pypi/v/selectolax.svg
|
|
40
|
-
:target: https://pypi.python.org/pypi/selectolax
|
|
41
|
-
|
|
42
|
-
A fast HTML5 parser with CSS selectors using `Modest <https://github.com/lexborisov/Modest/>`_ and
|
|
43
|
-
`Lexbor <https://github.com/lexbor/lexbor>`_ engines.
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
Installation
|
|
47
|
-
------------
|
|
48
|
-
From PyPI using pip:
|
|
49
|
-
|
|
50
|
-
.. code-block:: bash
|
|
51
|
-
|
|
52
|
-
pip install selectolax
|
|
53
|
-
|
|
54
|
-
If installation fails due to compilation errors, you may need to install `Cython <https://github.com/cython/cython>`_:
|
|
55
|
-
|
|
56
|
-
.. code-block:: bash
|
|
57
|
-
|
|
58
|
-
pip install selectolax[cython]
|
|
59
|
-
|
|
60
|
-
This usually happens when you try to install an outdated version of selectolax on a newer version of Python.
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
Development version from GitHub:
|
|
64
|
-
|
|
65
|
-
.. code-block:: bash
|
|
66
|
-
|
|
67
|
-
git clone --recursive https://github.com/rushter/selectolax
|
|
68
|
-
cd selectolax
|
|
69
|
-
pip install -r requirements_dev.txt
|
|
70
|
-
python setup.py install
|
|
71
|
-
|
|
72
|
-
How to compile selectolax while developing:
|
|
73
|
-
|
|
74
|
-
.. code-block:: bash
|
|
75
|
-
|
|
76
|
-
make clean
|
|
77
|
-
make dev
|
|
78
|
-
|
|
79
|
-
Basic examples
|
|
80
|
-
--------------
|
|
81
|
-
|
|
82
|
-
Here are some basic examples to get you started with selectolax:
|
|
83
|
-
|
|
84
|
-
Parsing HTML and extracting text:
|
|
85
|
-
|
|
86
|
-
.. code:: python
|
|
87
|
-
|
|
88
|
-
In [1]: from selectolax.parser import HTMLParser
|
|
89
|
-
...:
|
|
90
|
-
...: html = """
|
|
91
|
-
...: <h1 id="title" data-updated="20201101">Hi there</h1>
|
|
92
|
-
...: <div class="post">Lorem Ipsum is simply dummy text of the printing and typesetting industry. </div>
|
|
93
|
-
...: <div class="post">Lorem ipsum dolor sit amet, consectetur adipiscing elit.</div>
|
|
94
|
-
...: """
|
|
95
|
-
...: tree = HTMLParser(html)
|
|
96
|
-
|
|
97
|
-
In [2]: tree.css_first('h1#title').text()
|
|
98
|
-
Out[2]: 'Hi there'
|
|
99
|
-
|
|
100
|
-
In [3]: tree.css_first('h1#title').attributes
|
|
101
|
-
Out[3]: {'id': 'title', 'data-updated': '20201101'}
|
|
102
|
-
|
|
103
|
-
In [4]: [node.text() for node in tree.css('.post')]
|
|
104
|
-
Out[4]:
|
|
105
|
-
['Lorem Ipsum is simply dummy text of the printing and typesetting industry. ',
|
|
106
|
-
'Lorem ipsum dolor sit amet, consectetur adipiscing elit.']
|
|
107
|
-
|
|
108
|
-
Using advanced CSS selectors:
|
|
109
|
-
|
|
110
|
-
.. code:: python
|
|
111
|
-
|
|
112
|
-
In [1]: html = "<div><p id=p1><p id=p2><p id=p3><a>link</a><p id=p4><p id=p5>text<p id=p6></div>"
|
|
113
|
-
...: selector = "div > :nth-child(2n+1):not(:has(a))"
|
|
114
|
-
|
|
115
|
-
In [2]: for node in HTMLParser(html).css(selector):
|
|
116
|
-
...: print(node.attributes, node.text(), node.tag)
|
|
117
|
-
...: print(node.parent.tag)
|
|
118
|
-
...: print(node.html)
|
|
119
|
-
...:
|
|
120
|
-
{'id': 'p1'} p
|
|
121
|
-
div
|
|
122
|
-
<p id="p1"></p>
|
|
123
|
-
{'id': 'p5'} text p
|
|
124
|
-
div
|
|
125
|
-
<p id="p5">text</p>
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
* `Detailed overview <https://github.com/rushter/selectolax/blob/master/examples/walkthrough.ipynb>`_
|
|
129
|
-
|
|
130
|
-
Available backends
|
|
131
|
-
------------------
|
|
132
|
-
|
|
133
|
-
Selectolax supports two backends: ``Modest`` and ``Lexbor``. By default, all examples use the Modest backend.
|
|
134
|
-
Most of the features between backends are almost identical, but there are still some differences.
|
|
135
|
-
|
|
136
|
-
As of 2024, the preferred backend is ``Lexbor``. The ``Modest`` backend is still available for compatibility reasons
|
|
137
|
-
and the underlying C library that selectolax uses is not maintained anymore.
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
To use ``lexbor``, just import the parser and use it in the similar way to the `HTMLParser`.
|
|
141
|
-
|
|
142
|
-
.. code:: python
|
|
143
|
-
|
|
144
|
-
In [1]: from selectolax.lexbor import LexborHTMLParser
|
|
145
|
-
|
|
146
|
-
In [2]: html = """
|
|
147
|
-
...: <title>Hi there</title>
|
|
148
|
-
...: <div id="updated">2021-08-15</div>
|
|
149
|
-
...: """
|
|
150
|
-
|
|
151
|
-
In [3]: parser = LexborHTMLParser(html)
|
|
152
|
-
In [4]: parser.root.css_first("#updated").text()
|
|
153
|
-
Out[4]: '2021-08-15'
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
Simple Benchmark
|
|
157
|
-
----------------
|
|
158
|
-
|
|
159
|
-
* Extract title, links, scripts and a meta tag from main pages of top 754 domains. See ``examples/benchmark.py`` for more information.
|
|
160
|
-
|
|
161
|
-
============================ ===========
|
|
162
|
-
Package Time
|
|
163
|
-
============================ ===========
|
|
164
|
-
Beautiful Soup (html.parser) 61.02 sec.
|
|
165
|
-
lxml / Beautiful Soup (lxml) 9.09 sec.
|
|
166
|
-
html5_parser 16.10 sec.
|
|
167
|
-
selectolax (Modest) 2.94 sec.
|
|
168
|
-
selectolax (Lexbor) 2.39 sec.
|
|
169
|
-
============================ ===========
|
|
170
|
-
|
|
171
|
-
Links
|
|
172
|
-
-----
|
|
173
|
-
|
|
174
|
-
* `selectolax API reference <https://selectolax.readthedocs.io/en/latest/index.html>`_
|
|
175
|
-
* `Video introduction to web scraping using selectolax <https://youtu.be/HpRsfpPuUzE>`_
|
|
176
|
-
* `How to Scrape 7k Products with Python using selectolax and httpx <https://www.youtube.com/watch?v=XpGvq755J2U>`_
|
|
177
|
-
* `Detailed overview <https://github.com/rushter/selectolax/blob/master/examples/walkthrough.ipynb>`_
|
|
178
|
-
* `Modest introduction <https://lexborisov.github.io/Modest/>`_
|
|
179
|
-
* `Modest benchmark <https://lexborisov.github.io/benchmark-html-parsers/>`_
|
|
180
|
-
* `Python benchmark <https://rushter.com/blog/python-fast-html-parser/>`_
|
|
181
|
-
* `Another Python benchmark <https://www.peterbe.com/plog/selectolax-or-pyquery>`_
|
|
182
|
-
|
|
183
|
-
License
|
|
184
|
-
-------
|
|
185
|
-
|
|
186
|
-
* Modest engine — `LGPL2.1 <https://github.com/lexborisov/Modest/blob/master/LICENSE>`_
|
|
187
|
-
* selectolax - `MIT <https://github.com/rushter/selectolax/blob/master/LICENSE>`_
|
|
File without changes
|
|
File without changes
|
|
File without changes
|