nokolexbor 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/nokolexbor/nl_attribute.c +46 -0
- data/ext/nokolexbor/nl_cdata.c +8 -0
- data/ext/nokolexbor/nl_comment.c +6 -0
- data/ext/nokolexbor/nl_document.c +53 -7
- data/ext/nokolexbor/nl_document_fragment.c +9 -0
- data/ext/nokolexbor/nl_error.c +21 -19
- data/ext/nokolexbor/nl_node.c +255 -49
- data/ext/nokolexbor/nl_node_set.c +56 -1
- data/ext/nokolexbor/nl_processing_instruction.c +6 -0
- data/ext/nokolexbor/nl_text.c +6 -0
- data/ext/nokolexbor/nokolexbor.h +1 -0
- data/lib/nokolexbor/document.rb +52 -5
- data/lib/nokolexbor/document_fragment.rb +11 -0
- data/lib/nokolexbor/node.rb +367 -18
- data/lib/nokolexbor/node_set.rb +56 -0
- data/lib/nokolexbor/version.rb +1 -1
- metadata +2 -24
- data/vendor/lexbor/source/lexbor/encoding/base.h +0 -218
- data/vendor/lexbor/source/lexbor/encoding/big5.c +0 -42839
- data/vendor/lexbor/source/lexbor/encoding/config.cmake +0 -12
- data/vendor/lexbor/source/lexbor/encoding/const.h +0 -65
- data/vendor/lexbor/source/lexbor/encoding/decode.c +0 -3193
- data/vendor/lexbor/source/lexbor/encoding/decode.h +0 -370
- data/vendor/lexbor/source/lexbor/encoding/encode.c +0 -1931
- data/vendor/lexbor/source/lexbor/encoding/encode.h +0 -377
- data/vendor/lexbor/source/lexbor/encoding/encoding.c +0 -252
- data/vendor/lexbor/source/lexbor/encoding/encoding.h +0 -475
- data/vendor/lexbor/source/lexbor/encoding/euc_kr.c +0 -53883
- data/vendor/lexbor/source/lexbor/encoding/gb18030.c +0 -47905
- data/vendor/lexbor/source/lexbor/encoding/iso_2022_jp_katakana.c +0 -159
- data/vendor/lexbor/source/lexbor/encoding/jis0208.c +0 -22477
- data/vendor/lexbor/source/lexbor/encoding/jis0212.c +0 -15787
- data/vendor/lexbor/source/lexbor/encoding/multi.h +0 -53
- data/vendor/lexbor/source/lexbor/encoding/range.c +0 -71
- data/vendor/lexbor/source/lexbor/encoding/range.h +0 -34
- data/vendor/lexbor/source/lexbor/encoding/res.c +0 -222
- data/vendor/lexbor/source/lexbor/encoding/res.h +0 -34
- data/vendor/lexbor/source/lexbor/encoding/single.c +0 -13748
- data/vendor/lexbor/source/lexbor/encoding/single.h +0 -116
data/lib/nokolexbor/node_set.rb
CHANGED
@@ -4,6 +4,11 @@ module Nokolexbor
|
|
4
4
|
class NodeSet < Nokolexbor::Node
|
5
5
|
include Enumerable
|
6
6
|
|
7
|
+
# Create a NodeSet belonging to +document+; +list+ defaults to an empty Array.
|
8
|
+
#
|
9
|
+
# @yield [Document]
|
10
|
+
#
|
11
|
+
# @return [NodeSet]
|
7
12
|
def self.new(document, list = [])
|
8
13
|
obj = allocate
|
9
14
|
obj.instance_variable_set(:@document, document)
|
@@ -12,6 +17,9 @@ module Nokolexbor
|
|
12
17
|
obj
|
13
18
|
end
|
14
19
|
|
20
|
+
# Iterate over each node.
|
21
|
+
#
|
22
|
+
# @yield [Node]
|
15
23
|
def each
|
16
24
|
return to_enum unless block_given?
|
17
25
|
|
@@ -21,6 +29,11 @@ module Nokolexbor
|
|
21
29
|
self
|
22
30
|
end
|
23
31
|
|
32
|
+
# Get the first +n+ elements of the NodeSet.
|
33
|
+
#
|
34
|
+
# @param n [Numeric,nil]
|
35
|
+
#
|
36
|
+
# @return [Node,Array<Node>] {Node} if +n+ is nil, otherwise {Array<Node>}
|
24
37
|
def first(n = nil)
|
25
38
|
return self[0] unless n
|
26
39
|
|
@@ -29,14 +42,19 @@ module Nokolexbor
|
|
29
42
|
list
|
30
43
|
end
|
31
44
|
|
45
|
+
# Get the last element of the NodeSet.
|
46
|
+
#
|
47
|
+
# @return [Node,nil]
|
32
48
|
def last
|
33
49
|
self[-1]
|
34
50
|
end
|
35
51
|
|
52
|
+
# @return [Boolean] true if this NodeSet is empty.
|
36
53
|
def empty?
|
37
54
|
length == 0
|
38
55
|
end
|
39
56
|
|
57
|
+
# @return [Integer,nil] The index of the first node in this NodeSet that is equal to +node+ or meets the given block. Returns nil if no match is found.
|
40
58
|
def index(node = nil)
|
41
59
|
if node
|
42
60
|
each_with_index { |member, j| return j if member == node }
|
@@ -46,6 +64,9 @@ module Nokolexbor
|
|
46
64
|
nil
|
47
65
|
end
|
48
66
|
|
67
|
+
# Get the content of all contained Nodes.
|
68
|
+
#
|
69
|
+
# @return [String]
|
49
70
|
def content
|
50
71
|
self.map(&:content).join
|
51
72
|
end
|
@@ -54,10 +75,16 @@ module Nokolexbor
|
|
54
75
|
alias_method :inner_text, :content
|
55
76
|
alias_method :to_str, :content
|
56
77
|
|
78
|
+
# Get the inner html of all contained Nodes.
|
79
|
+
#
|
80
|
+
# @return [String]
|
57
81
|
def inner_html(*args)
|
58
82
|
self.map { |n| n.inner_html(*args) }.join
|
59
83
|
end
|
60
84
|
|
85
|
+
# Convert this NodeSet to HTML.
|
86
|
+
#
|
87
|
+
# @return [String]
|
61
88
|
def outer_html(*args)
|
62
89
|
self.map { |n| n.outer_html(*args) }.join
|
63
90
|
end
|
@@ -66,6 +93,9 @@ module Nokolexbor
|
|
66
93
|
alias_method :to_html, :outer_html
|
67
94
|
alias_method :serialize, :outer_html
|
68
95
|
|
96
|
+
# Remove all nodes in this NodeSet.
|
97
|
+
#
|
98
|
+
# @see Node#remove
|
69
99
|
def remove
|
70
100
|
self.each(&:remove)
|
71
101
|
end
|
@@ -73,22 +103,32 @@ module Nokolexbor
|
|
73
103
|
alias_method :unlink, :remove
|
74
104
|
alias_method :to_ary, :to_a
|
75
105
|
|
106
|
+
# Destroy all nodes in the NodeSet.
|
107
|
+
#
|
108
|
+
# @see Node#destroy
|
76
109
|
def destroy
|
77
110
|
self.each(&:destroy)
|
78
111
|
end
|
79
112
|
|
113
|
+
# @return [Node,nil] The last element of this NodeSet and removes it. Returns
|
114
|
+
# +nil+ if the set is empty.
|
80
115
|
def pop
|
81
116
|
return nil if length == 0
|
82
117
|
|
83
118
|
delete(last)
|
84
119
|
end
|
85
120
|
|
121
|
+
# @return [Node,nil] The first element of this NodeSet and removes it. Returns
|
122
|
+
# +nil+ if the set is empty.
|
86
123
|
def shift
|
87
124
|
return nil if length == 0
|
88
125
|
|
89
126
|
delete(first)
|
90
127
|
end
|
91
128
|
|
129
|
+
# @return [Boolean] true if two NodeSets contain the same number
|
130
|
+
# of elements and each element is equal to the corresponding
|
131
|
+
# element in the other NodeSet.
|
92
132
|
def ==(other)
|
93
133
|
return false unless other.is_a?(NodeSet)
|
94
134
|
return false unless length == other.length
|
@@ -99,6 +139,8 @@ module Nokolexbor
|
|
99
139
|
true
|
100
140
|
end
|
101
141
|
|
142
|
+
# @return [NodeSet] A new NodeSet containing all the children of all the nodes in
|
143
|
+
# the NodeSet.
|
102
144
|
def children
|
103
145
|
node_set = NodeSet.new(@document)
|
104
146
|
each do |node|
|
@@ -107,6 +149,8 @@ module Nokolexbor
|
|
107
149
|
node_set
|
108
150
|
end
|
109
151
|
|
152
|
+
# @return [NodeSet] A new NodeSet containing all the nodes in the NodeSet
|
153
|
+
# in reverse order.
|
110
154
|
def reverse
|
111
155
|
node_set = NodeSet.new(@document)
|
112
156
|
(length - 1).downto(0) do |x|
|
@@ -115,6 +159,17 @@ module Nokolexbor
|
|
115
159
|
node_set
|
116
160
|
end
|
117
161
|
|
162
|
+
# Wrap all nodes of this NodeSet with +node_or_tags+.
|
163
|
+
#
|
164
|
+
# @see Node#wrap
|
165
|
+
#
|
166
|
+
# @return [NodeSet] +self+, to support chaining.
|
167
|
+
def wrap(node_or_tags)
|
168
|
+
map { |node| node.wrap(node_or_tags) }
|
169
|
+
self
|
170
|
+
end
|
171
|
+
|
172
|
+
# (see Node#xpath)
|
118
173
|
def xpath(*args)
|
119
174
|
paths, handler, ns, binds = extract_params(args)
|
120
175
|
|
@@ -127,6 +182,7 @@ module Nokolexbor
|
|
127
182
|
end
|
128
183
|
end
|
129
184
|
|
185
|
+
# (see Node#nokogiri_css)
|
130
186
|
def nokogiri_css(*args)
|
131
187
|
rules, handler, ns, _ = extract_params(args)
|
132
188
|
paths = css_rules_to_xpath(rules, ns)
|
data/lib/nokolexbor/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokolexbor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.4
|
4
|
+
version: 0.3.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yicheng Zhou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-01-
|
11
|
+
date: 2023-01-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake-compiler
|
@@ -259,28 +259,6 @@ files:
|
|
259
259
|
- vendor/lexbor/source/lexbor/dom/interfaces/shadow_root.h
|
260
260
|
- vendor/lexbor/source/lexbor/dom/interfaces/text.c
|
261
261
|
- vendor/lexbor/source/lexbor/dom/interfaces/text.h
|
262
|
-
- vendor/lexbor/source/lexbor/encoding/base.h
|
263
|
-
- vendor/lexbor/source/lexbor/encoding/big5.c
|
264
|
-
- vendor/lexbor/source/lexbor/encoding/config.cmake
|
265
|
-
- vendor/lexbor/source/lexbor/encoding/const.h
|
266
|
-
- vendor/lexbor/source/lexbor/encoding/decode.c
|
267
|
-
- vendor/lexbor/source/lexbor/encoding/decode.h
|
268
|
-
- vendor/lexbor/source/lexbor/encoding/encode.c
|
269
|
-
- vendor/lexbor/source/lexbor/encoding/encode.h
|
270
|
-
- vendor/lexbor/source/lexbor/encoding/encoding.c
|
271
|
-
- vendor/lexbor/source/lexbor/encoding/encoding.h
|
272
|
-
- vendor/lexbor/source/lexbor/encoding/euc_kr.c
|
273
|
-
- vendor/lexbor/source/lexbor/encoding/gb18030.c
|
274
|
-
- vendor/lexbor/source/lexbor/encoding/iso_2022_jp_katakana.c
|
275
|
-
- vendor/lexbor/source/lexbor/encoding/jis0208.c
|
276
|
-
- vendor/lexbor/source/lexbor/encoding/jis0212.c
|
277
|
-
- vendor/lexbor/source/lexbor/encoding/multi.h
|
278
|
-
- vendor/lexbor/source/lexbor/encoding/range.c
|
279
|
-
- vendor/lexbor/source/lexbor/encoding/range.h
|
280
|
-
- vendor/lexbor/source/lexbor/encoding/res.c
|
281
|
-
- vendor/lexbor/source/lexbor/encoding/res.h
|
282
|
-
- vendor/lexbor/source/lexbor/encoding/single.c
|
283
|
-
- vendor/lexbor/source/lexbor/encoding/single.h
|
284
262
|
- vendor/lexbor/source/lexbor/html/base.h
|
285
263
|
- vendor/lexbor/source/lexbor/html/config.cmake
|
286
264
|
- vendor/lexbor/source/lexbor/html/encoding.c
|
@@ -1,218 +0,0 @@
|
|
1
|
-
/*
|
2
|
-
* Copyright (C) 2019 Alexander Borisov
|
3
|
-
*
|
4
|
-
* Author: Alexander Borisov <borisov@lexbor.com>
|
5
|
-
*/
|
6
|
-
|
7
|
-
#ifndef LEXBOR_ENCODING_BASE_H
|
8
|
-
#define LEXBOR_ENCODING_BASE_H
|
9
|
-
|
10
|
-
#ifdef __cplusplus
|
11
|
-
extern "C" {
|
12
|
-
#endif
|
13
|
-
|
14
|
-
#include "lexbor/core/base.h"
|
15
|
-
#include "lexbor/encoding/const.h"
|
16
|
-
|
17
|
-
|
18
|
-
#define LXB_ENCODING_VERSION_MAJOR 2
|
19
|
-
#define LXB_ENCODING_VERSION_MINOR 0
|
20
|
-
#define LXB_ENCODING_VERSION_PATCH 1
|
21
|
-
|
22
|
-
#define LXB_ENCODING_VERSION_STRING \
|
23
|
-
LEXBOR_STRINGIZE(LXB_ENCODING_VERSION_MAJOR) "." \
|
24
|
-
LEXBOR_STRINGIZE(LXB_ENCODING_VERSION_MINOR) "." \
|
25
|
-
LEXBOR_STRINGIZE(LXB_ENCODING_VERSION_PATCH)
|
26
|
-
|
27
|
-
|
28
|
-
#define LXB_ENCODING_REPLACEMENT_BYTES ((lxb_char_t *) "\xEF\xBF\xBD")
|
29
|
-
|
30
|
-
#define LXB_ENCODING_REPLACEMENT_BUFFER_LEN 1
|
31
|
-
#define LXB_ENCODING_REPLACEMENT_BUFFER \
|
32
|
-
(&(const lxb_codepoint_t) {LXB_ENCODING_REPLACEMENT_CODEPOINT})
|
33
|
-
|
34
|
-
|
35
|
-
/*
|
36
|
-
* In UTF-8 0x10FFFF value is maximum (inclusive)
|
37
|
-
*/
|
38
|
-
enum {
|
39
|
-
LXB_ENCODING_REPLACEMENT_SIZE = 0x03,
|
40
|
-
LXB_ENCODING_REPLACEMENT_CODEPOINT = 0xFFFD,
|
41
|
-
LXB_ENCODING_MAX_CODEPOINT = 0x10FFFF,
|
42
|
-
LXB_ENCODING_ERROR_CODEPOINT = 0x1FFFFF
|
43
|
-
};
|
44
|
-
|
45
|
-
enum {
|
46
|
-
LXB_ENCODING_ENCODE_OK = 0x00,
|
47
|
-
LXB_ENCODING_ENCODE_ERROR = -0x01,
|
48
|
-
LXB_ENCODING_ENCODE_SMALL_BUFFER = -0x02
|
49
|
-
};
|
50
|
-
|
51
|
-
enum {
|
52
|
-
LXB_ENCODING_DECODE_MAX_CODEPOINT = LXB_ENCODING_MAX_CODEPOINT,
|
53
|
-
LXB_ENCODING_DECODE_ERROR = LXB_ENCODING_ERROR_CODEPOINT,
|
54
|
-
LXB_ENCODING_DECODE_CONTINUE = 0x2FFFFF
|
55
|
-
};
|
56
|
-
|
57
|
-
enum {
|
58
|
-
LXB_ENCODING_DECODE_2022_JP_ASCII = 0x00,
|
59
|
-
LXB_ENCODING_DECODE_2022_JP_ROMAN,
|
60
|
-
LXB_ENCODING_DECODE_2022_JP_KATAKANA,
|
61
|
-
LXB_ENCODING_DECODE_2022_JP_LEAD,
|
62
|
-
LXB_ENCODING_DECODE_2022_JP_TRAIL,
|
63
|
-
LXB_ENCODING_DECODE_2022_JP_ESCAPE_START,
|
64
|
-
LXB_ENCODING_DECODE_2022_JP_ESCAPE,
|
65
|
-
LXB_ENCODING_DECODE_2022_JP_UNSET
|
66
|
-
};
|
67
|
-
|
68
|
-
enum {
|
69
|
-
LXB_ENCODING_ENCODE_2022_JP_ASCII = 0x00,
|
70
|
-
LXB_ENCODING_ENCODE_2022_JP_ROMAN,
|
71
|
-
LXB_ENCODING_ENCODE_2022_JP_JIS0208
|
72
|
-
};
|
73
|
-
|
74
|
-
typedef struct {
|
75
|
-
unsigned need;
|
76
|
-
lxb_char_t lower;
|
77
|
-
lxb_char_t upper;
|
78
|
-
}
|
79
|
-
lxb_encoding_ctx_utf_8_t;
|
80
|
-
|
81
|
-
typedef struct {
|
82
|
-
lxb_char_t first;
|
83
|
-
lxb_char_t second;
|
84
|
-
lxb_char_t third;
|
85
|
-
}
|
86
|
-
lxb_encoding_ctx_gb18030_t;
|
87
|
-
|
88
|
-
typedef struct {
|
89
|
-
lxb_char_t lead;
|
90
|
-
bool is_jis0212;
|
91
|
-
}
|
92
|
-
lxb_encoding_ctx_euc_jp_t;
|
93
|
-
|
94
|
-
typedef struct {
|
95
|
-
lxb_char_t lead;
|
96
|
-
lxb_char_t prepand;
|
97
|
-
unsigned state;
|
98
|
-
unsigned out_state;
|
99
|
-
bool out_flag;
|
100
|
-
}
|
101
|
-
lxb_encoding_ctx_2022_jp_t;
|
102
|
-
|
103
|
-
typedef struct lxb_encoding_data lxb_encoding_data_t;
|
104
|
-
|
105
|
-
typedef struct {
|
106
|
-
const lxb_encoding_data_t *encoding_data;
|
107
|
-
|
108
|
-
/* Out buffer */
|
109
|
-
lxb_codepoint_t *buffer_out;
|
110
|
-
size_t buffer_length;
|
111
|
-
size_t buffer_used;
|
112
|
-
|
113
|
-
/*
|
114
|
-
* Bad code points will be replaced to user code point.
|
115
|
-
* If replace_to == 0 stop parsing and return error to user.
|
116
|
-
*/
|
117
|
-
const lxb_codepoint_t *replace_to;
|
118
|
-
size_t replace_len;
|
119
|
-
|
120
|
-
/* Not for users */
|
121
|
-
lxb_codepoint_t codepoint;
|
122
|
-
lxb_codepoint_t second_codepoint;
|
123
|
-
bool prepend;
|
124
|
-
bool have_error;
|
125
|
-
|
126
|
-
lxb_status_t status;
|
127
|
-
|
128
|
-
union {
|
129
|
-
lxb_encoding_ctx_utf_8_t utf_8;
|
130
|
-
lxb_encoding_ctx_gb18030_t gb18030;
|
131
|
-
unsigned lead;
|
132
|
-
lxb_encoding_ctx_euc_jp_t euc_jp;
|
133
|
-
lxb_encoding_ctx_2022_jp_t iso_2022_jp;
|
134
|
-
} u;
|
135
|
-
}
|
136
|
-
lxb_encoding_decode_t;
|
137
|
-
|
138
|
-
typedef struct {
|
139
|
-
const lxb_encoding_data_t *encoding_data;
|
140
|
-
|
141
|
-
/* Out buffer */
|
142
|
-
lxb_char_t *buffer_out;
|
143
|
-
size_t buffer_length;
|
144
|
-
size_t buffer_used;
|
145
|
-
|
146
|
-
/*
|
147
|
-
* Bad code points will be replaced to user bytes.
|
148
|
-
* If replace_to == NULL stop parsing and return error to user.
|
149
|
-
*/
|
150
|
-
const lxb_char_t *replace_to;
|
151
|
-
size_t replace_len;
|
152
|
-
|
153
|
-
unsigned state;
|
154
|
-
}
|
155
|
-
lxb_encoding_encode_t;
|
156
|
-
|
157
|
-
/*
|
158
|
-
* Why can't I pass a char ** to a function which expects a const char **?
|
159
|
-
* http://c-faq.com/ansi/constmismatch.html
|
160
|
-
*
|
161
|
-
* Short answer: use cast (const char **).
|
162
|
-
*
|
163
|
-
* For example:
|
164
|
-
* lxb_encoding_ctx_t ctx = {0};
|
165
|
-
* const lxb_encoding_data_t *enc;
|
166
|
-
*
|
167
|
-
* lxb_char_t *data = (lxb_char_t *) "\x81\x30\x84\x36";
|
168
|
-
*
|
169
|
-
* enc = lxb_encoding_data(LXB_ENCODING_GB18030);
|
170
|
-
*
|
171
|
-
* enc->decode(&ctx, (const lxb_char_t **) &data, data + 4);
|
172
|
-
*/
|
173
|
-
typedef lxb_status_t
|
174
|
-
(*lxb_encoding_encode_f)(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cp,
|
175
|
-
const lxb_codepoint_t *end);
|
176
|
-
|
177
|
-
typedef lxb_status_t
|
178
|
-
(*lxb_encoding_decode_f)(lxb_encoding_decode_t *ctx,
|
179
|
-
const lxb_char_t **data, const lxb_char_t *end);
|
180
|
-
|
181
|
-
typedef int8_t
|
182
|
-
(*lxb_encoding_encode_single_f)(lxb_encoding_encode_t *ctx, lxb_char_t **data,
|
183
|
-
const lxb_char_t *end, lxb_codepoint_t cp);
|
184
|
-
|
185
|
-
typedef lxb_codepoint_t
|
186
|
-
(*lxb_encoding_decode_single_f)(lxb_encoding_decode_t *ctx,
|
187
|
-
const lxb_char_t **data, const lxb_char_t *end);
|
188
|
-
|
189
|
-
struct lxb_encoding_data {
|
190
|
-
lxb_encoding_t encoding;
|
191
|
-
lxb_encoding_encode_f encode;
|
192
|
-
lxb_encoding_decode_f decode;
|
193
|
-
lxb_encoding_encode_single_f encode_single;
|
194
|
-
lxb_encoding_decode_single_f decode_single;
|
195
|
-
lxb_char_t *name;
|
196
|
-
};
|
197
|
-
|
198
|
-
typedef struct {
|
199
|
-
lxb_char_t *name;
|
200
|
-
unsigned size;
|
201
|
-
lxb_codepoint_t codepoint;
|
202
|
-
}
|
203
|
-
lxb_encoding_single_index_t;
|
204
|
-
|
205
|
-
typedef lxb_encoding_single_index_t lxb_encoding_multi_index_t;
|
206
|
-
|
207
|
-
typedef struct {
|
208
|
-
unsigned index;
|
209
|
-
lxb_codepoint_t codepoint;
|
210
|
-
}
|
211
|
-
lxb_encoding_range_index_t;
|
212
|
-
|
213
|
-
|
214
|
-
#ifdef __cplusplus
|
215
|
-
} /* extern "C" */
|
216
|
-
#endif
|
217
|
-
|
218
|
-
#endif /* LEXBOR_ENCODING_BASE_H */
|