ssc_codegen 0.14.0__tar.gz → 0.15.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/PKG-INFO +1 -1
  2. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/pyproject.toml +1 -1
  3. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/__init__.py +4 -3
  4. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/ast_/__init__.py +4 -1
  5. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/ast_/nodes_core.py +41 -3
  6. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/ast_build/builder.py +39 -1
  7. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/go_goquery.py +31 -12
  8. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/js_pure.py +32 -3
  9. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/lua_htmlparser.py +32 -1
  10. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/py_base.py +32 -4
  11. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/py_bs4.py +30 -2
  12. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/py_lxml.py +37 -25
  13. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/py_parsel.py +28 -0
  14. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/py_selectolax.py +28 -0
  15. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/document.py +34 -1
  16. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/schema.py +15 -1
  17. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/tokens.py +6 -0
  18. ssc_codegen-0.15.0/ssc_codegen/transform.py +95 -0
  19. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/.gitignore +0 -0
  20. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/LICENSE +0 -0
  21. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/README.md +0 -0
  22. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/_compat.py +0 -0
  23. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/ast_/base.py +0 -0
  24. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/ast_/nodes_array.py +0 -0
  25. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/ast_/nodes_cast.py +0 -0
  26. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/ast_/nodes_filter.py +0 -0
  27. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/ast_/nodes_selectors.py +0 -0
  28. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/ast_/nodes_string.py +0 -0
  29. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/ast_/nodes_validate.py +0 -0
  30. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/ast_build/__init__.py +0 -0
  31. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/ast_build/main.py +0 -0
  32. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/ast_build/utils.py +0 -0
  33. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/ast_grep_rules/js_rules.yml +0 -0
  34. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/ast_grep_rules/py_drop_prefix_suffix_backport.yml +0 -0
  35. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/ast_grep_rules/py_rules.yml +0 -0
  36. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/cli/__init__.py +0 -0
  37. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/cli/ast_grep.py +0 -0
  38. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/cli/cli_callbacks.py +0 -0
  39. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/cli/cli_utils.py +0 -0
  40. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/cli/code_callbacks.py +0 -0
  41. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/cli/consts.py +0 -0
  42. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/cli/main.py +0 -0
  43. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/cli/runtime_parse_runners.py +0 -0
  44. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/compiler.py +0 -0
  45. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/__init__.py +0 -0
  46. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/base.py +0 -0
  47. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/helpers.py +0 -0
  48. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/templates/__init__.py +0 -0
  49. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/templates/go_goquery.py +0 -0
  50. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/templates/js_pure.py +0 -0
  51. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/templates/lua_base.py +0 -0
  52. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/templates/lua_css_compat.py +0 -0
  53. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/templates/lua_re_compat.py +0 -0
  54. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/converters/templates/py_base.py +0 -0
  55. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/document_utlis.py +0 -0
  56. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/json_struct.py +0 -0
  57. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/json_to_scc.py +0 -0
  58. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/logs.py +0 -0
  59. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/pseudo_selectors.py +0 -0
  60. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/selector_utils.py +0 -0
  61. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/static_checker/__init__.py +0 -0
  62. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/static_checker/base.py +0 -0
  63. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/static_checker/callbacks.py +0 -0
  64. {ssc_codegen-0.14.0 → ssc_codegen-0.15.0}/ssc_codegen/str_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ssc_codegen
3
- Version: 0.14.0
3
+ Version: 0.15.0
4
4
  Summary: Python-dsl code converter to html parser for web scraping
5
5
  Project-URL: Documentation, https://github.com/vypivshiy/selector_schema_codegen#readme
6
6
  Project-URL: Issues, https://github.com/vypivshiy/selector_schema_codegen/issues
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ssc_codegen"
3
- version = "0.14.0"
3
+ version = "0.15.0"
4
4
  description = "Python-dsl code converter to html parser for web scraping "
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -1,13 +1,13 @@
1
1
  from ssc_codegen.document import DocumentElementsFilter, HTMLDocument, ClassVarDocument, StringDocument, ArrayDocument, AssertDocument, NestedDocument, \
2
2
  DefaultDocument, \
3
- NumericDocument, JsonDocument, BooleanDocument, DocumentFilter
3
+ NumericDocument, JsonDocument, BooleanDocument, DocumentFilter, TransformDocument
4
4
  from ssc_codegen.json_struct import Json
5
5
  from ssc_codegen.logs import setup_logger
6
6
  from ssc_codegen.schema import ItemSchema, DictSchema, ListSchema, FlatListSchema, AccUniqueListSchema
7
7
 
8
8
  setup_logger()
9
9
 
10
- VERSION = "0.14.0"
10
+ VERSION = "0.15.0"
11
11
 
12
12
 
13
13
  class __MISSING(object):
@@ -26,7 +26,8 @@ class Document(
26
26
  DefaultDocument,
27
27
  NumericDocument,
28
28
  JsonDocument,
29
- BooleanDocument
29
+ BooleanDocument,
30
+ TransformDocument
30
31
  ):
31
32
  """Special Common Document or Element marker manipulations"""
32
33
  pass
@@ -38,7 +38,10 @@ from .nodes_core import (
38
38
  ExprClassVar,
39
39
  ExprCallStructClassVar,
40
40
  CodeStart,
41
- CodeEnd
41
+ CodeEnd,
42
+ ExprTransform,
43
+ ModuleTransformImports,
44
+ ModuleUtilities
42
45
  )
43
46
  from .nodes_filter import (
44
47
  FilterOr,
@@ -1,5 +1,5 @@
1
1
  from dataclasses import dataclass, field
2
- from typing import Sequence, TypedDict, ClassVar, TypeVar
2
+ from typing import Sequence, TypedDict, ClassVar, TypeVar, TYPE_CHECKING
3
3
 
4
4
  from ssc_codegen.ast_.base import T_EMPTY_KWARGS, BaseAstNode
5
5
  from ssc_codegen.ast_.base import (
@@ -14,6 +14,9 @@ from ssc_codegen.tokens import (
14
14
  VariableType,
15
15
  )
16
16
 
17
+ if TYPE_CHECKING:
18
+ from ssc_codegen.transform import BaseTransform
19
+
17
20
  KW_DOCSTRING = TypedDict("KW_DOCSTRING", {"value": str})
18
21
 
19
22
 
@@ -34,11 +37,32 @@ class Docstring(_DisableRepr, BaseAstNode[KW_DOCSTRING, tuple[str]]):
34
37
  """
35
38
 
36
39
  kind: ClassVar[TokenType] = TokenType.DOCSTRING
37
- kwargs: KW_DOCSTRING
40
+
41
+
42
+ # carries the transforms whose import dependencies must be resolved
43
+ KW_MODULE_IMPORTS = TypedDict(
44
+ "KW_MODULE_IMPORTS", {"transforms": list["BaseTransform"]}
45
+ )
46
+ ARGS_MODULE_IMPORTS = tuple[list["BaseTransform"]]
47
+
48
+
49
+ @dataclass(kw_only=True)
50
+ class ModuleTransformImports(
51
+ BaseAstNode[KW_MODULE_IMPORTS, ARGS_MODULE_IMPORTS]
52
+ ):
53
+ """AST node representing module import transform dependencies.
54
+
55
+ This node contains import statements that will be added to the
56
+ generated module to ensure required dependencies are available.
57
+ """
58
+
59
+ kind: ClassVar[TokenType] = TokenType.TRANSFORM_IMPORTS
38
60
 
39
61
 
40
62
  @dataclass(kw_only=True)
41
- class ModuleImports(_DisableRepr, BaseAstNode[T_EMPTY_KWARGS, tuple]):
63
+ class ModuleImports(
64
+ _DisableRepr, BaseAstNode[KW_MODULE_IMPORTS, ARGS_MODULE_IMPORTS]
65
+ ):
42
66
  """AST node representing module import statements.
43
67
 
44
68
  This node contains import statements that will be added to the
@@ -551,3 +575,17 @@ class CodeEnd(_DisableRepr, BaseAstNode):
551
575
  """
552
576
 
553
577
  kind: ClassVar[TokenType] = TokenType.CODE_END
578
+
579
+
580
+ KW_TRANSFORM = TypedDict("KW_TRANSFORM", {"transform": "BaseTransform"})
581
+ ARGS_TRANSFORM = tuple["BaseTransform"]
582
+
583
+
584
+ @dataclass(kw_only=True)
585
+ class ExprTransform(BaseAstNode[KW_TRANSFORM, ARGS_TRANSFORM]):
586
+ kind: ClassVar[TokenType] = TokenType.TRANSFORM
587
+
588
+
589
+ @dataclass(kw_only=True)
590
+ class ModuleUtilities(_DisableRepr, BaseAstNode):
591
+ kind: ClassVar[TokenType] = TokenType.UTILITIES
@@ -3,6 +3,7 @@ from typing import MutableSequence, Type, cast
3
3
  import warnings
4
4
  from typing_extensions import Self
5
5
 
6
+ from ssc_codegen.ast_ import ModuleTransformImports, ModuleUtilities
6
7
  from ssc_codegen.ast_.base import BaseAstNode
7
8
  from ssc_codegen.ast_.nodes_cast import ExprJsonify, ExprNested
8
9
  from ssc_codegen.ast_.nodes_core import (
@@ -113,7 +114,16 @@ class AstBuilder:
113
114
  Docstring(kwargs={"value": module_doc}, parent=self.module)
114
115
  )
115
116
  # import node
116
- self.module.body.append(ModuleImports(parent=self.module))
117
+ # fallback: transforms are also stored in ModuleImports.kwargs["transforms"] for target languages that do not allow multiple import directives
118
+ self.module.body.append(
119
+ ModuleImports(parent=self.module, kwargs={"transforms": []})
120
+ )
121
+ self.module.body.append(
122
+ ModuleTransformImports(
123
+ parent=self.module, kwargs={"transforms": []}
124
+ )
125
+ )
126
+ self.module.body.append(ModuleUtilities(parent=self.module))
117
127
  # CodeStart hook node
118
128
  self.module.body.append(CodeStart(parent=self.module))
119
129
  return self
@@ -398,10 +408,23 @@ class AstBuilder:
398
408
  # generate typedef as annotation node
399
409
  return st
400
410
 
411
+ @staticmethod
412
+ def is_collected_type_transform(transforms: list, target: object) -> bool:
413
+ for t in transforms:
414
+ if isinstance(t, type(target)):
415
+ return True
416
+ return False
417
+
401
418
  def add_struct_parsers(self, *struct_parsers: Type[BaseSchema]) -> Self:
402
419
  ssc_structs = []
403
420
  ssc_typedefs = []
421
+ ssc_transforms = []
404
422
  for schema in struct_parsers:
423
+ if schema.__SSC_TRANSFORMS__:
424
+ # keep only one transform per type to avoid generating duplicate imports
425
+ for t in schema.__SSC_TRANSFORMS__:
426
+ if not any(isinstance(i, type(t)) for i in ssc_transforms):
427
+ ssc_transforms.append(t)
405
428
  if is_literals_only_schema(schema):
406
429
  if schema.__SCHEMA_TYPE__ != StructType.ITEM:
407
430
  raise TypeError(
@@ -420,6 +443,21 @@ class AstBuilder:
420
443
  continue
421
444
  ssc_typedefs.append(typedef)
422
445
  # assembly
446
+
447
+ # insert transforms imports (in high-level API provide insert dependencies)
448
+
449
+ node_transform_import = self.module.find_node_by_token(
450
+ ModuleTransformImports.kind
451
+ )
452
+ node_transform_import = cast(
453
+ ModuleTransformImports, node_transform_import
454
+ )
455
+ node_transform_import.kwargs["transforms"] = ssc_transforms.copy()
456
+
457
# Mirror the collected transforms on the regular imports node as well.
# The key must be "transforms": that is the key declared in KW_MODULE_IMPORTS
# and the one the node is created with, so writing "transform" would leave
# the declared entry as an empty list and add a stray second key.
node_import = cast(
    ModuleImports, self.module.find_node_by_token(ModuleImports.kind)
)
node_import.kwargs["transforms"] = ssc_transforms.copy()
460
+
423
461
  self.module.body.extend(ssc_typedefs)
424
462
  self.module.body.extend(ssc_structs)
425
463
  self.module.body.append(CodeEnd(parent=self.module))
@@ -51,6 +51,9 @@ from typing_extensions import assert_never
51
51
 
52
52
  from ssc_codegen.ast_ import (
53
53
  Docstring,
54
+ ExprTransform,
55
+ ModuleTransformImports,
56
+ ModuleUtilities,
54
57
  StructParser,
55
58
  ExprReturn,
56
59
  ExprNoReturn,
@@ -393,21 +396,37 @@ def pre_docstring(node: Docstring) -> str:
393
396
 
394
397
  @CONVERTER(ModuleImports.kind)
395
398
  def pre_module_imports(_node: ModuleImports) -> str:
399
+ # TODO: resolve imports
400
+ return IMPORTS.replace("$PACKAGE$", CONVERTER.PACKAGE)
401
+
402
+
403
+ @CONVERTER(ModuleTransformImports.kind)
404
+ def pre_transform_imports(node: ModuleTransformImports) -> str:
405
+ transforms, *_ = node.unpack_args()
406
+ if transforms:
407
+ for t in transforms:
408
+ if t.collect_dependencies("go_goquery"):
409
+ raise NotImplementedError(
410
+ "Current converter implementation not support pass extra import dependencies"
411
+ )
412
+ return ""
413
+
414
+
415
+ @CONVERTER(ExprTransform.kind)
416
+ def pre_transform(node: ExprTransform) -> str:
417
+ prv, nxt = prev_next_var(node)
418
+ transform, *_ = node.unpack_args()
419
+ parts = transform.emit("go_goquery", prv, nxt)
420
+ return "\n" + "\n".join(parts)
421
+
422
+
423
@CONVERTER(ModuleUtilities.kind)
def pre_module_utilities(_: ModuleUtilities) -> str:
    """Emit the Go helper functions the first time this hook fires.

    Returns:
        ``HELPER_FUNCTIONS`` while ``CONVERTER.HELPER_FUNCS_IMPORTED`` is
        still unset; an empty string on every later call.
    """
    # HACK:
    # golang does not allow overriding or duplicating functions, so the
    # helpers are guarded by a converter-level flag.
    if CONVERTER.HELPER_FUNCS_IMPORTED:
        return ""
    # Set the flag only after it has been checked: setting it first makes
    # the check always see True and the helpers are never emitted.
    CONVERTER.HELPER_FUNCS_IMPORTED = True
    return HELPER_FUNCTIONS
411
430
 
412
431
 
413
432
  def get_typedef_field_by_name(node: TypeDef, field_name: str) -> str:
@@ -23,6 +23,9 @@ from typing import cast
23
23
 
24
24
  from ssc_codegen.ast_ import (
25
25
  Docstring,
26
+ ExprTransform,
27
+ ModuleTransformImports,
28
+ ModuleUtilities,
26
29
  StructParser,
27
30
  ExprReturn,
28
31
  ExprNoReturn,
@@ -173,7 +176,11 @@ DOCSTR_START = "/**"
173
176
  DOCSTR_END = "*/"
174
177
  DOCSTR_SEP = "* "
175
178
  CONVERTER = BaseCodeConverter(debug_comment_prefix="// ")
176
- # javascript not support typing/annotations
179
+ CONVERTER.TEST_EXCLUDE_NODES.extend(
180
+ [
181
+ ModuleImports.kind, # not use imports
182
+ ]
183
+ )
177
184
 
178
185
 
179
186
  # TODO: move to string_utils
@@ -213,8 +220,30 @@ def py_sequence_to_js_array(values: tuple[str, ...] | list[str]) -> str:
213
220
  return "[" + val_arr[1:-1] + "]"
214
221
 
215
222
 
216
- @CONVERTER(ModuleImports.kind)
217
- def pre_imports(_: ModuleImports) -> str:
223
+ @CONVERTER(ModuleTransformImports.kind)
224
+ def pre_transform_imports(node: ModuleTransformImports) -> str:
225
+ transforms, *_ = node.unpack_args()
226
+
227
+ if not transforms:
228
+ return ""
229
+
230
+ code_imports = []
231
+ for t in transforms:
232
+ if deps := t.collect_dependencies("js_pure"):
233
+ code_imports.append("\n".join(deps))
234
+ return "\n".join(code_imports)
235
+
236
+
237
+ @CONVERTER(ExprTransform.kind)
238
+ def pre_transform(node: ExprTransform) -> str:
239
+ # emit the transform code registered for the js_pure backend, one statement per line
240
+ prv, nxt = prev_next_var(node)
241
+ transform, *_ = node.unpack_args()
242
+ return "\n".join(f"{i};" for i in transform.emit("js_pure", prv, nxt))
243
+
244
+
245
+ @CONVERTER(ModuleUtilities.kind)
246
+ def pre_imports(_: ModuleUtilities) -> str:
218
247
  return HELPER_FUNCTIONS
219
248
 
220
249
 
@@ -28,6 +28,9 @@ from typing import Any, cast
28
28
 
29
29
  from ssc_codegen.ast_ import (
30
30
  Docstring,
31
+ ExprTransform,
32
+ ModuleTransformImports,
33
+ ModuleUtilities,
31
34
  StructParser,
32
35
  ExprReturn,
33
36
  ExprNoReturn,
@@ -187,6 +190,7 @@ CONVERTER.TEST_EXCLUDE_NODES.extend(
187
190
  FilterDocHasText.kind,
188
191
  FilterDocIsRegexText.kind,
189
192
  FilterDocIsRegexRaw.kind,
193
+ # NOT IMPLEMENTED
190
194
  ]
191
195
  )
192
196
 
@@ -291,7 +295,34 @@ def py_regex_to_lua_rex_pcre(
291
295
 
292
296
  @CONVERTER(ModuleImports.kind)
293
297
  def pre_imports(_: ModuleImports) -> str:
294
- return IMPORTS + "\n" + HELPER_FUNCTIONS
298
+ return IMPORTS
299
+
300
+
301
+ @CONVERTER(ModuleTransformImports.kind)
302
+ def pre_transform_imports(node: ModuleTransformImports) -> str:
303
+ transforms, *_ = node.unpack_args()
304
+
305
+ if not transforms:
306
+ return ""
307
+
308
+ code_imports = []
309
+ for t in transforms:
310
+ if deps := t.collect_dependencies("lua_htmlparser"):
311
+ code_imports.append("\n".join(deps))
312
+ return "\n".join(code_imports)
313
+
314
+
315
+ @CONVERTER(ModuleUtilities.kind)
316
+ def pre_utilities(_: ModuleUtilities) -> str:
317
+ return HELPER_FUNCTIONS
318
+
319
+
320
+ @CONVERTER(ExprTransform.kind)
321
+ def pre_transform(node: ExprTransform) -> str:
322
+ prv, nxt = prev_next_var(node)
323
+ transform, *_ = node.unpack_args()
324
+ parts = transform.emit("lua_htmlparser", prv, nxt)
325
+ return "\n" + "\n".join(parts)
295
326
 
296
327
 
297
328
  @CONVERTER(Docstring.kind)
@@ -35,7 +35,10 @@ from typing import cast
35
35
 
36
36
  from ssc_codegen.ast_ import (
37
37
  Docstring,
38
+ ExprTransform,
38
39
  ModuleImports,
40
+ ModuleTransformImports,
41
+ ModuleUtilities,
39
42
  TypeDef,
40
43
  TypeDefField,
41
44
  JsonStruct,
@@ -214,6 +217,9 @@ class BasePyCodeConverter(BaseCodeConverter):
214
217
  self.pre_definitions = {
215
218
  Docstring.kind: pre_docstring,
216
219
  ModuleImports.kind: pre_imports,
220
+ ModuleTransformImports.kind: pre_transform_imports,
221
+ ExprTransform.kind: pre_transform,
222
+ ModuleUtilities.kind: pre_utilities,
217
223
  TypeDefField.kind: pre_typedef_field,
218
224
  JsonStruct.kind: pre_json_struct,
219
225
  JsonStructField.kind: pre_json_field,
@@ -295,7 +301,6 @@ class BasePyCodeConverter(BaseCodeConverter):
295
301
  }
296
302
 
297
303
  self.post_definitions = {
298
- ModuleImports.kind: post_imports,
299
304
  JsonStruct.kind: post_json_struct,
300
305
  ExprFilter.kind: post_expr_filter,
301
306
  FilterAnd.kind: post_filter_and,
@@ -321,6 +326,16 @@ class BasePyCodeConverter(BaseCodeConverter):
321
326
  }
322
327
 
323
328
 
329
+ def pre_transform(node: ExprTransform) -> str:
330
+ prv, nxt = prev_next_var(node)
331
+ transform, *_ = node.unpack_args()
332
+ parts = transform.emit("py_base", prv, nxt)
333
+ if have_default_expr(node):
334
+ code = INDENT_DEFAULT_BODY + INDENT_DEFAULT_BODY.join(parts)
335
+ return code
336
+ return INDENT_METHOD_BODY + INDENT_METHOD_BODY.join(parts)
337
+
338
+
324
339
  def get_typedef_field_by_name(node: TypeDef, field_name: str) -> str:
325
340
  value = [i for i in node.body if i.kwargs["name"] == field_name][0]
326
341
  value = cast(TypeDefField, value)
@@ -366,12 +381,25 @@ def pre_docstring(node: Docstring) -> str:
366
381
  return ""
367
382
 
368
383
 
369
- def pre_imports(_node: ModuleImports) -> str:
384
+ def pre_utilities(_: ModuleUtilities) -> str:
385
+ return HELPER_FUNCTIONS
386
+
387
+
388
+ def pre_imports(_: ModuleImports) -> str:
370
389
  return IMPORTS_MIN
371
390
 
372
391
 
373
- def post_imports(_: ModuleImports) -> str:
374
- return HELPER_FUNCTIONS
392
+ def pre_transform_imports(node: ModuleTransformImports) -> str:
393
+ transforms, *_ = node.unpack_args()
394
+
395
+ if not transforms:
396
+ return ""
397
+
398
+ code_imports = []
399
+ for t in transforms:
400
+ if deps := t.collect_dependencies("py_base"):
401
+ code_imports.append("\n".join(deps))
402
+ return "\n".join(code_imports)
375
403
 
376
404
 
377
405
  # TYPEDEF
@@ -1,6 +1,7 @@
1
1
  from ssc_codegen.ast_ import (
2
2
  ExprCss,
3
3
  ExprCssAll,
4
+ ExprTransform,
4
5
  ModuleImports,
5
6
  ExprXpathAll,
6
7
  ExprXpath,
@@ -12,6 +13,7 @@ from ssc_codegen.ast_ import (
12
13
  ExprGetHtmlRaw,
13
14
  ExprGetHtmlRawAll,
14
15
  ExprIsCss,
16
+ ModuleTransformImports,
15
17
  StructInitMethod,
16
18
  StructFieldMethod,
17
19
  StructPreValidateMethod,
@@ -64,6 +66,32 @@ from ssc_codegen.converters.templates.py_base import (
64
66
  CONVERTER = BasePyCodeConverter()
65
67
 
66
68
 
69
+ @CONVERTER.post(ModuleTransformImports.kind)
70
+ def post_transform_imports(node: ModuleTransformImports) -> str:
71
+ transforms, *_ = node.unpack_args()
72
+
73
+ if not transforms:
74
+ return ""
75
+
76
+ code_imports = []
77
+ for t in transforms:
78
+ if deps := t.collect_dependencies("py_bs4"):
79
+ code_imports.append("\n".join(deps))
80
+ return "\n".join(code_imports)
81
+
82
+
83
+ @CONVERTER.post(ExprTransform.kind)
84
+ def post_transform(node: ExprTransform) -> str:
85
+ # emit the transform code registered for the py_bs4 backend
86
+ prv, nxt = prev_next_var(node)
87
+ transform, *_ = node.unpack_args()
88
+ parts = transform.emit("py_bs4", prv, nxt)
89
+ if have_default_expr(node):
90
+ code = INDENT_DEFAULT_BODY + INDENT_DEFAULT_BODY.join(parts)
91
+ return code
92
+ return INDENT_METHOD_BODY + INDENT_METHOD_BODY.join(parts)
93
+
94
+
67
95
  @CONVERTER(StructInitMethod.kind)
68
96
  def pre_init(_node: StructInitMethod) -> str:
69
97
  return (
@@ -102,10 +130,10 @@ def pre_struct_field_method(node: StructFieldMethod) -> str:
102
130
 
103
131
 
104
132
  @CONVERTER.pre(ModuleImports.kind)
105
- def pre_imports(_: ModuleImports) -> str:
133
+ def pre_imports(node: ModuleImports) -> str:
106
134
  return (
107
135
  IMPORTS_MIN
108
- + "from bs4 import BeautifulSoup, ResultSet, Tag # noqa (for typing)"
136
+ + "from bs4 import BeautifulSoup, ResultSet, Tag # noqa (for typing)\n"
109
137
  )
110
138
 
111
139
 
@@ -1,6 +1,7 @@
1
1
  from ssc_codegen.ast_ import (
2
2
  ExprCss,
3
3
  ExprCssAll,
4
+ ExprTransform,
4
5
  ModuleImports,
5
6
  ExprXpathAll,
6
7
  ExprXpath,
@@ -12,6 +13,8 @@ from ssc_codegen.ast_ import (
12
13
  ExprGetHtmlRaw,
13
14
  ExprGetHtmlRawAll,
14
15
  ExprIsCss,
16
+ ModuleTransformImports,
17
+ ModuleUtilities,
15
18
  StructInitMethod,
16
19
  StructFieldMethod,
17
20
  StructPreValidateMethod,
@@ -62,24 +65,6 @@ from ssc_codegen.converters.templates.py_base import IMPORTS_MIN
62
65
 
63
66
  CONVERTER = BasePyCodeConverter()
64
67
 
65
- """
66
- # HtmlElement → сразу принимаем
67
- if isinstance(document, html.HtmlElement):
68
- self._document = document
69
- return
70
-
71
- # Строка → пробуем парсить
72
- if isinstance(document, str):
73
- try:
74
- self._document = html.fromstring(document.strip() or self.FALLBACK_HTML)
75
- return
76
- except (ParserError, ValueError):
77
- pass
78
-
79
- # Всё остальное или ошибка → заглушка
80
- self._document = html.fromstring(self.FALLBACK_HTML)
81
- """
82
-
83
68
 
84
69
  @CONVERTER(StructInitMethod.kind)
85
70
  def pre_init(_node: StructInitMethod) -> str:
@@ -123,15 +108,42 @@ def pre_struct_field_method(node: StructFieldMethod) -> str:
123
108
 
124
109
 
125
110
  @CONVERTER(ModuleImports.kind)
126
- def pre_imports(_: ModuleImports) -> str:
111
+ def pre_imports(node: ModuleImports) -> str:
127
112
  # lxml throws a parse error if an empty string is passed
128
113
  # most mainstream html parser libs use the `FALLBACK_HTML_STR` stub value
129
- return (
130
- IMPORTS_MIN
131
- + """from lxml import html
132
- FALLBACK_HTML_STR = "<html><body></body></html>"
133
- """
134
- )
114
+
115
+ return IMPORTS_MIN + "from lxml import html"
116
+
117
+
118
+ @CONVERTER.post(ModuleTransformImports.kind)
119
+ def post_transform_imports(node: ModuleTransformImports) -> str:
120
+ transforms, *_ = node.unpack_args()
121
+
122
+ if not transforms:
123
+ return ""
124
+
125
+ code_imports = []
126
+ for t in transforms:
127
+ if deps := t.collect_dependencies("py_lxml"):
128
+ code_imports.append("\n".join(deps))
129
+ return "\n".join(code_imports)
130
+
131
+
132
+ @CONVERTER.post(ExprTransform.kind)
133
+ def post_transform(node: ExprTransform) -> str:
134
+ # emit the transform code registered for the py_lxml backend
135
+ prv, nxt = prev_next_var(node)
136
+ transform, *_ = node.unpack_args()
137
+ parts = transform.emit("py_lxml", prv, nxt)
138
+ if have_default_expr(node):
139
+ code = INDENT_DEFAULT_BODY + INDENT_DEFAULT_BODY.join(parts)
140
+ return code
141
+ return INDENT_METHOD_BODY + INDENT_METHOD_BODY.join(parts)
142
+
143
+
144
+ @CONVERTER.post(ModuleUtilities.kind)
145
+ def post_utilities(_: ModuleUtilities) -> str:
146
+ return 'FALLBACK_HTML_STR = "<html><body></body></html>"'
135
147
 
136
148
 
137
149
  @CONVERTER(ExprCss.kind)
@@ -1,6 +1,7 @@
1
1
  from ssc_codegen.ast_ import (
2
2
  ExprCss,
3
3
  ExprCssAll,
4
+ ExprTransform,
4
5
  ModuleImports,
5
6
  ExprXpathAll,
6
7
  ExprXpath,
@@ -12,6 +13,7 @@ from ssc_codegen.ast_ import (
12
13
  ExprGetHtmlRaw,
13
14
  ExprGetHtmlRawAll,
14
15
  ExprIsCss,
16
+ ModuleTransformImports,
15
17
  StructInitMethod,
16
18
  StructFieldMethod,
17
19
  StructPreValidateMethod,
@@ -62,6 +64,32 @@ from ssc_codegen.converters.templates.py_base import IMPORTS_MIN
62
64
  CONVERTER = BasePyCodeConverter()
63
65
 
64
66
 
67
+ @CONVERTER.post(ModuleTransformImports.kind)
68
+ def post_transform_imports(node: ModuleTransformImports) -> str:
69
+ transforms, *_ = node.unpack_args()
70
+
71
+ if not transforms:
72
+ return ""
73
+
74
+ code_imports = []
75
+ for t in transforms:
76
+ if deps := t.collect_dependencies("py_parsel"):
77
+ code_imports.append("\n".join(deps))
78
+ return "\n".join(code_imports)
79
+
80
+
81
+ @CONVERTER.post(ExprTransform.kind)
82
+ def post_transform(node: ExprTransform) -> str:
83
+ # emit the transform code registered for the py_parsel backend
84
+ prv, nxt = prev_next_var(node)
85
+ transform, *_ = node.unpack_args()
86
+ parts = transform.emit("py_parsel", prv, nxt)
87
+ if have_default_expr(node):
88
+ code = INDENT_DEFAULT_BODY + INDENT_DEFAULT_BODY.join(parts)
89
+ return code
90
+ return INDENT_METHOD_BODY + INDENT_METHOD_BODY.join(parts)
91
+
92
+
65
93
  @CONVERTER(StructInitMethod.kind)
66
94
  def pre_init(_node: StructInitMethod) -> str:
67
95
  return (
@@ -1,6 +1,7 @@
1
1
  from ssc_codegen.ast_ import (
2
2
  ExprCss,
3
3
  ExprCssAll,
4
+ ExprTransform,
4
5
  ModuleImports,
5
6
  ExprXpathAll,
6
7
  ExprXpath,
@@ -12,6 +13,7 @@ from ssc_codegen.ast_ import (
12
13
  ExprGetHtmlRaw,
13
14
  ExprGetHtmlRawAll,
14
15
  ExprIsCss,
16
+ ModuleTransformImports,
15
17
  StructInitMethod,
16
18
  StructFieldMethod,
17
19
  StructPreValidateMethod,
@@ -62,6 +64,32 @@ from ssc_codegen.converters.templates.py_base import IMPORTS_MIN
62
64
  CONVERTER = BasePyCodeConverter()
63
65
 
64
66
 
67
+ @CONVERTER.post(ModuleTransformImports.kind)
68
+ def post_transform_imports(node: ModuleTransformImports) -> str:
69
+ transforms, *_ = node.unpack_args()
70
+
71
+ if not transforms:
72
+ return ""
73
+
74
+ code_imports = []
75
+ for t in transforms:
76
+ if deps := t.collect_dependencies("py_selectolax"):
77
+ code_imports.append("\n".join(deps))
78
+ return "\n".join(code_imports)
79
+
80
+
81
+ @CONVERTER.post(ExprTransform.kind)
82
+ def post_transform(node: ExprTransform) -> str:
83
+ # emit the transform code registered for the py_selectolax backend
84
+ prv, nxt = prev_next_var(node)
85
+ transform, *_ = node.unpack_args()
86
+ parts = transform.emit("py_selectolax", prv, nxt)
87
+ if have_default_expr(node):
88
+ code = INDENT_DEFAULT_BODY + INDENT_DEFAULT_BODY.join(parts)
89
+ return code
90
+ return INDENT_METHOD_BODY + INDENT_METHOD_BODY.join(parts)
91
+
92
+
65
93
  @CONVERTER(StructInitMethod.kind)
66
94
  def pre_init(_node: StructInitMethod) -> str:
67
95
  return (
@@ -2,7 +2,16 @@
2
2
 
3
3
  from functools import wraps
4
4
  import logging
5
- from typing import Any, Callable, Type, Pattern, Sequence, TypeVar, Union
5
+ from typing import (
6
+ Any,
7
+ Callable,
8
+ Type,
9
+ Pattern,
10
+ Sequence,
11
+ TypeVar,
12
+ Union,
13
+ TYPE_CHECKING,
14
+ )
6
15
  from re import Pattern as RePattern
7
16
 
8
17
  from cssselect import SelectorSyntaxError
@@ -13,6 +22,7 @@ from ssc_codegen.ast_ import (
13
22
  ExprDefaultValueWrapper,
14
23
  ExprCss,
15
24
  ExprCssAll,
25
+ ExprTransform,
16
26
  ExprXpathAll,
17
27
  ExprGetHtmlAttr,
18
28
  ExprGetHtmlText,
@@ -125,6 +135,9 @@ from ssc_codegen.schema import BaseSchema
125
135
  from ssc_codegen.selector_utils import validate_css_query, validate_xpath_query
126
136
  from ssc_codegen.tokens import TokenType, VariableType
127
137
 
138
+ if TYPE_CHECKING:
139
+ from .transform import BaseTransform
140
+
128
141
  LOGGER = logging.getLogger("ssc_gen")
129
142
 
130
143
  T = TypeVar("T", bound="BaseDocument")
@@ -2626,3 +2639,23 @@ class DocumentElementsFilter(BaseDocument):
2626
2639
  new_filter = DocumentElementsFilter()
2627
2640
  new_filter.stack.extend(self.stack)
2628
2641
  return DocumentElementsFilter().not_(new_filter)
2642
+
2643
+
2644
class TransformDocument(BaseDocument):
    """Document part that provides the ``transform()`` step."""

    def transform(self, transformer: "BaseTransform") -> Self:
        """Append an ``ExprTransform`` node that wraps ``transformer``.

        The node takes its accept/return types from the transformer. When
        the current stack type differs from ``transformer.accept_type`` a
        warning is logged, but the node is still added.

        Args:
            transformer: the transform instance to attach to the stack.

        Returns:
            This document, so calls can be chained.
        """
        expected = transformer.accept_type
        current = self.stack_last_ret
        # manual type test: the transformer declares what it accepts
        if current != expected:
            LOGGER.warning(
                "transform() [%s]: Expected type(s) %s, got %s",
                transformer.__class__.__name__,
                expected.name,
                current.name,
            )
        node = ExprTransform(
            kwargs={"transform": transformer},
            accept_type=expected,
            ret_type=transformer.return_type,
        )
        self._add(node)
        return self
@@ -9,6 +9,7 @@ from .tokens import StructType, TokenType, VariableType
9
9
  if TYPE_CHECKING:
10
10
  from .document import BaseDocument, ClassVarDocument
11
11
  from .json_struct import Json
12
+ from .transform import BaseTransform
12
13
 
13
14
 
14
15
  class MISSING_FIELD(object): # noqa
@@ -155,6 +156,16 @@ class SchemaMeta(type):
155
156
  tmp_cvars.pop(cvar_name)
156
157
  break
157
158
 
159
@staticmethod
def __resolve_self_transforms(cls: Type["BaseSchema"]) -> None:
    """Store every transform used by the schema fields in ``__SSC_TRANSFORMS__``.

    Walks the expression stack of each field in ``cls.__SSC_MRO_FIELDS__``
    and keeps the ``"transform"`` kwarg of every node whose kind is
    ``TokenType.TRANSFORM``, in encounter order (duplicates are kept).
    """
    # must be called after __fill_mro_fields(cls): it reads __SSC_MRO_FIELDS__
    cls.__SSC_TRANSFORMS__ = [
        node.kwargs["transform"]
        for field in cls.__SSC_MRO_FIELDS__.values()
        for node in field.stack
        if node.kind == TokenType.TRANSFORM
    ]
168
+
158
169
  def __new__(mcs, name, bases, namespace, **kwargs): # type: ignore
159
170
  cls = super().__new__(mcs, name, bases, namespace)
160
171
  cls = cast(Type["BaseSchema"], cls)
@@ -169,7 +180,7 @@ class SchemaMeta(type):
169
180
  mcs.__fill_mro_fields(cls)
170
181
  mcs.__fill_literals(cls, name)
171
182
  mcs.__resolve_self_classvars(cls)
172
-
183
+ mcs.__resolve_self_transforms(cls)
173
184
  return cls
174
185
 
175
186
 
@@ -213,6 +224,9 @@ class BaseSchema(metaclass=SchemaMeta):
213
224
  # old name - literals
214
225
  __SSC_MRO_CLASSVARS__: dict[str, "ClassVarDocument"]
215
226
 
227
+ # transforms used by the schema fields; cached to optimize collecting extra imports
228
+ __SSC_TRANSFORMS__: list["BaseTransform"]
229
+
216
230
  # retained for API backward compatibility
217
231
  @classmethod
218
232
  def __class_signature__(cls) -> dict[str, Any] | list[str | Any] | None:
@@ -87,6 +87,9 @@ class TokenType(IntEnum):
87
87
  due to the risk of race-condition or other side effects.
88
88
  """
89
89
 
90
+ UTILITIES = auto()
91
+ """used for part import logic and helper functions/constants"""
92
+
90
93
  # utils nodes
91
94
  CODE_START = auto()
92
95
  """insert to ast after DOCSTRING and IMPORTS tokens
@@ -100,6 +103,9 @@ class TokenType(IntEnum):
100
103
  maybe usef for inject custom code or logic
101
104
  """
102
105
 
106
+ TRANSFORM = auto()
107
+ TRANSFORM_IMPORTS = auto()
108
+
103
109
  # STRUCTS
104
110
  STRUCT = auto()
105
111
  STRUCT_INIT = auto()
@@ -0,0 +1,95 @@
1
+ from typing import Callable, ClassVar, TypedDict
2
+ from .tokens import VariableType
3
+
4
+
5
# Record describing one backend emitter of a transform class.
EmitSpec = TypedDict(
    "EmitSpec",
    {
        # attribute name of the emitter method on the transform class
        "method_name": str,
        # import lines the emitted code requires
        "dependencies": list[str],
        # the emitter function object itself
        "func": Callable[..., list[str]],
    },
)
9
+
10
+
11
+ def target(backend: str, dependencies: list[str] | None = None):
12
+ """decorator for provide transformation
13
+
14
+ Args:
15
+ backend - target lib (py_base, js_pure, py_bs4 etc)
16
+ dependencies - optional required imports (push to ModuleImports() AST nodes)
17
+ """
18
+ dependencies = dependencies or []
19
+
20
+ def decorator(fn: Callable[..., list[str]]) -> Callable[..., list[str]]:
21
+ setattr(fn, "_emit_backend", backend)
22
+ setattr(fn, "_emit_deps", dependencies)
23
+ return fn
24
+
25
+ return decorator
26
+
27
+
28
class BaseTransformMeta(type):
    """Metaclass that gathers backend emitters into ``cls._emit_impls``.

    A method counts as an emitter when it carries an ``_emit_backend``
    attribute (and optionally ``_emit_deps``). The resulting mapping is
    ``backend -> {"method_name", "dependencies", "func"}``.
    """

    def __new__(mcs, name, bases, namespace):
        """Create the class and build its merged emitter registry.

        Raises:
            ValueError: if the class body itself tags two methods for the
                same backend.
        """
        cls = super().__new__(mcs, name, bases, namespace)

        impls: "dict[str, EmitSpec]" = {}

        # Inherited specs: walk from the farthest base to the nearest one and
        # overwrite on the way, so the nearest base wins (setdefault here
        # would keep the farthest base's spec instead).
        for base in reversed(cls.__mro__[1:]):
            base_impls = getattr(base, "_emit_impls", None)
            if not isinstance(base_impls, dict):
                continue
            for backend, spec in base_impls.items():
                impls[backend] = {
                    "method_name": spec["method_name"],
                    # copy so subclasses never share a list with a base
                    "dependencies": list(spec["dependencies"]),
                    "func": spec["func"],
                }

        # Own specs: only backends tagged twice in THIS class body are
        # duplicates; overriding an inherited emitter is legitimate, even
        # when the method name is reused.
        declared: dict[str, str] = {}
        for attr_name, attr_value in namespace.items():
            if not (
                callable(attr_value) and hasattr(attr_value, "_emit_backend")
            ):
                continue
            backend = getattr(attr_value, "_emit_backend")
            deps = getattr(attr_value, "_emit_deps", []) or []

            if backend in declared:
                raise ValueError(
                    f"Duplicate emit implementations for backend '{backend}' in class {name}"
                )
            declared[backend] = attr_name

            impls[backend] = {
                "method_name": attr_name,
                "dependencies": list(deps),
                "func": attr_value,
            }

        setattr(cls, "_emit_impls", impls)
        return cls
68
+
69
+
70
class BaseTransform(metaclass=BaseTransformMeta):
    """Base class for transforms that emit code per backend.

    Emitters are looked up in ``_emit_impls``, a mapping keyed by backend
    name that the metaclass fills in.
    """

    # override these in subclasses; they feed the static type analysis
    accept_type: ClassVar[VariableType]
    return_type: ClassVar[VariableType]

    # backend name -> emit spec; autofilled by the metaclass
    _emit_impls: ClassVar[dict[str, EmitSpec]] = {}

    @classmethod
    def has_backend(cls, backend: str) -> bool:
        """Return True when an emitter is registered for ``backend``."""
        registry = getattr(cls, "_emit_impls", {})
        return backend in registry

    @classmethod
    def get_emit_spec(cls, backend: str) -> EmitSpec | None:
        """Return the emit spec registered for ``backend``, or None."""
        registry = getattr(cls, "_emit_impls", {})
        return registry.get(backend)

    def collect_dependencies(self, backend: str) -> list[str]:
        """Return a copy of the dependency lines for ``backend``.

        The list is empty when no emitter is registered for the backend.
        """
        spec = self.get_emit_spec(backend)
        if not spec:
            return []
        return spec["dependencies"][:]

    def emit(self, backend: str, prv: str, nxt: str) -> list[str]:
        """Call the emitter registered for ``backend`` with ``prv`` and ``nxt``.

        The emitter is resolved by name on the instance, so it is called as
        a bound method. Returns an empty list when the backend has no
        registered emitter.
        """
        spec = self.get_emit_spec(backend)
        if spec:
            emitter = getattr(self, spec["method_name"])
            return emitter(prv, nxt)
        return []
File without changes
File without changes
File without changes