hiroiyomi 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -0
- data/README.md +1 -3
- data/lib/hiroiyomi/html/attribute.rb +48 -2
- data/lib/hiroiyomi/html/childable.rb +12 -0
- data/lib/hiroiyomi/html/document.rb +14 -6
- data/lib/hiroiyomi/html/dom_parser.rb +38 -0
- data/lib/hiroiyomi/html/dom_parser_helper.rb +70 -0
- data/lib/hiroiyomi/html/element.rb +164 -6
- data/lib/hiroiyomi/html/text.rb +88 -0
- data/lib/hiroiyomi/parser.rb +4 -12
- data/lib/hiroiyomi/version.rb +1 -1
- data/lib/hiroiyomi.rb +4 -3
- metadata +7 -3
- data/lib/hiroiyomi/html_parser.rb +0 -191
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 0661022f19059a23cf6ec690cb47bb531f2b08225197e487648d91a8f31df1d7
|
|
4
|
+
data.tar.gz: 813da1519fe1b3da7e41775d5ca6b63198d5846e8f1624f95d9b82df0f46b35c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0bd9bc0554a39fc7ed25133be2cda01d2393268ef49ca1dee9c9dbc67cfa1354bb940a9e651c8236006df0c17f277eb0f821dfe4dbf3584ca6756044a65070c1
|
|
7
|
+
data.tar.gz: bc3f1e4e3ab39af9f83367a9d72fa622a801978fb840c426464cf7704ec16d9ad81f594efb360ac76c60fc0107801fbd9ce280edcdc242042404d61a11c62175
|
data/.ruby-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
2.5.1
|
data/README.md
CHANGED
|
@@ -1,8 +1,6 @@
|
|
|
1
1
|
# Hiroiyomi
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
TODO: Delete this and the text above, and describe your gem
|
|
3
|
+
Provides features to parse and filter HTML elements.
|
|
6
4
|
|
|
7
5
|
## Installation
|
|
8
6
|
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require '
|
|
4
|
-
require 'openssl'
|
|
3
|
+
require 'hiroiyomi/html/dom_parser_helper'
|
|
5
4
|
|
|
6
5
|
module Hiroiyomi
|
|
7
6
|
module Html
|
|
@@ -9,6 +8,53 @@ module Hiroiyomi
|
|
|
9
8
|
class Attribute
|
|
10
9
|
attr_accessor :name, :value
|
|
11
10
|
|
|
11
|
+
class << self
  # Reads the next attribute (name plus optional value) from the stream.
  #
  # @param file [#getc, #ungetc] character stream positioned after the
  #   element name or after a previous attribute
  # @return [Attribute, nil] the parsed attribute, or nil when no attribute
  #   name can be read at the current position
  def value_of(file)
    name = DOMParserHelper.extract_string(file)
    return nil if name.empty?

    value = extract_value(file)
    # A bare attribute (or an empty quoted value) is stored with a nil value.
    Attribute.new(name, value.empty? ? nil : value)
  end

  private

  # Reads the value part that follows an attribute name. Supported forms:
  #   name=value    (ends at whitespace or ">")
  #   name="value"
  #   name='value'
  # The terminating whitespace, ">" or "/" is pushed back onto the stream;
  # a closing quote is consumed.
  #
  # @param file [#getc, #ungetc] stream positioned right after the name
  # @return [String] the value without quotes, "" when there is none
  def extract_value(file)
    value = +''
    quote = nil   # the quote character currently open, if any
    equal = false # whether the "=" separator has been seen

    while (c = file.getc)
      if quote
        # Inside quotes everything is literal except the matching quote.
        break if c == quote

        value << c
      elsif c == "'" || c == '"'
        quote = c
      elsif c == '=' && !equal
        equal = true
      elsif c == '>' || c.match?(/\s/) || (!equal && c == '/')
        # End of this attribute: any whitespace (not only a space), the end
        # of the tag, or the "/" of "/>" right after a bare attribute.
        file.ungetc(c)
        break
      elsif equal
        # Unquoted value; a later "=" (e.g. in a query string) is kept.
        value << c
      end
    end
    value
  end
end
|
|
57
|
+
|
|
12
58
|
def initialize(name, value = nil)
|
|
13
59
|
@name = name
|
|
14
60
|
@value = value
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require '
|
|
4
|
-
require '
|
|
3
|
+
require 'hiroiyomi/html/element'
|
|
4
|
+
require 'hiroiyomi/html/attribute'
|
|
5
|
+
require 'hiroiyomi/html/text'
|
|
6
|
+
require 'hiroiyomi/html/dom_parser_helper'
|
|
5
7
|
|
|
6
8
|
module Hiroiyomi
|
|
7
9
|
module Html
|
|
@@ -11,12 +13,18 @@ module Hiroiyomi
|
|
|
11
13
|
|
|
12
14
|
attr_accessor :root
|
|
13
15
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
+
class << self
  # Builds a Document from a character stream.
  #
  # @param file [#getc, nil] stream to parse; nil yields an empty document
  # @return [Document] document whose root is whatever Element.value_of
  #   returns for the stream (root stays nil when file is nil)
  def value_of(file)
    document = new
    return document if file.nil?

    document.root = Element.value_of(file)
    document
  end
end
|
|
17
25
|
|
|
18
|
-
def
|
|
19
|
-
@root =
|
|
26
|
+
# Creates an empty document; the root is assigned later via #root=.
def initialize
  @root = nil
end
|
|
21
29
|
|
|
22
30
|
def each
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'hiroiyomi/parser'
|
|
4
|
+
require 'hiroiyomi/html/document'
|
|
5
|
+
|
|
6
|
+
module Hiroiyomi
  module Html
    # Parser implementation that builds a Document from a stream and filters
    # its elements by tag name. The public entry points come from Parser.
    class DOMParser
      include Parser

      private

      # @param file [#getc, nil] stream to parse
      # @return [Document]
      def do_parse(file)
        Document.value_of(file)
      end

      # Collects the elements whose tag name is listed in +filter+.
      #
      # @param document [#each] document (or element) to walk
      # @param filter [Array<String, Symbol>, nil] tag names, matched
      #   case-insensitively
      # @param is_deep [Boolean] when true, every match is additionally
      #   expanded with Element#deep_select
      # @return [Array] matching elements
      def do_filter(document, filter:, is_deep: true)
        names = normalize_filter(filter)
        filtered_elements = filter_element(document, names, [])
        return filtered_elements unless is_deep

        filtered_elements.flat_map { |e| e.deep_select(names) }
      end

      # Walks +element+ depth-first; a matching child is collected and not
      # descended into, a non-matching child is searched recursively.
      # Text nodes are skipped.
      def filter_element(element, filter, filtered_elements)
        element.each do |child|
          next if child.text?

          if filter&.include?(child.name.downcase)
            filtered_elements.push(child)
          else
            filter_element(child, filter, filtered_elements)
          end
        end
        filtered_elements
      end

      # Tag names are compared in lower case, so the filter is normalised to
      # lower-case strings once up front (nil becomes an empty list).
      def normalize_filter(filter)
        Array(filter).map { |name| name.to_s.downcase }
      end
    end
  end
end
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Hiroiyomi
  module Html
    # Stateless helpers for reading tokens from a character stream that
    # supports getc, ungetc, pos and pos=.
    class DOMParserHelper
      class << self
        # Position of the character +c+ that was just read from +file+.
        # The character is pushed back to read the position and then
        # consumed again, so the stream is left where it was.
        #
        # @param file [#getc, #ungetc, #pos]
        # @param c [String] the character most recently read from +file+
        # @return [Integer]
        def cur_pos(file, c)
          file.ungetc(c) # In order to get current position correctly
          cur_pos = file.pos
          file.getc # consume the character again
          cur_pos
        end

        # Consumes whitespace; the first non-whitespace character is pushed
        # back. (\s already covers space, tab, newline and carriage return.)
        #
        # @return [nil]
        def skip_ignore_chars(file)
          while (c = file.getc)
            next if c.match?(/\s/)

            file.ungetc(c)
            return
          end
        end

        # Skips leading whitespace and reads a run of word characters and
        # hyphens (a tag or attribute name). The first other character is
        # pushed back.
        #
        # @return [String] the name, "" when none is present
        def extract_string(file)
          skip_ignore_chars(file)
          string = +''
          while (c = file.getc)
            unless c.match?(/[\w-]/)
              file.ungetc(c)
              break
            end
            string << c
          end
          string
        end

        # Reads up to and including the first occurrence of the two
        # characters +char_before_last_char+ + +last_char+ (by default "]>"),
        # or to the end of the stream.
        #
        # @return [String] everything consumed, terminator included
        def extract_text_with_symbols(file, char_before_last_char = ']', last_char = '>')
          string = +''
          while (c = file.getc)
            string << c
            next_c = file.getc
            if c == char_before_last_char && last_char == next_c
              string << next_c
              break
            end
            file.ungetc(next_c) if next_c
          end
          string
        end

        # Handles what follows "<!".
        #
        # @return [String, nil] "[...]>" text for a CDATA-style section,
        #   "" for a comment (its content is dropped), or nil for anything
        #   else, in which case the stream position is restored
        def extract_bang_text(file)
          cur_pos = file.pos
          case (c = file.getc)
          when '[' # CDATA
            return "#{c}#{extract_text_with_symbols(file, ']')}"
          when '-' # Comment
            extract_text_with_symbols(file, '-')
            return '' # Drop comments
          end
          file.pos = cur_pos
          nil
        end
      end
    end
  end
end
|
|
@@ -1,25 +1,145 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require '
|
|
4
|
-
require '
|
|
3
|
+
require 'hiroiyomi/html/childable'
|
|
4
|
+
require 'hiroiyomi/html/attribute'
|
|
5
|
+
require 'hiroiyomi/html/text'
|
|
5
6
|
|
|
6
7
|
module Hiroiyomi
|
|
7
8
|
module Html
|
|
8
9
|
# Element
|
|
9
10
|
class Element
|
|
10
11
|
include Enumerable
|
|
12
|
+
include Childable
|
|
11
13
|
|
|
12
|
-
attr_accessor :name, :
|
|
14
|
+
attr_accessor :name, :parent, :attributes, :children
|
|
13
15
|
|
|
14
|
-
|
|
16
|
+
class << self
  # Elements whose content is read verbatim as text instead of being parsed.
  EXCEPTIONAL_ELEMENT_NAME_LIST = %w[script style].freeze

  # Parses the next element found in the stream and attaches it to
  # +parent_element+.
  #
  # @param file [#getc, #ungetc, #pos, #pos=] stream to read from
  # @param parent_element [Element, nil] element that receives the new child
  # @return [Element, nil] +parent_element+; when no parent was given, the
  #   newly parsed element itself (nil if no element name was found)
  def value_of(file, parent_element = nil)
    name = extract_element_name(file)
    return parent_element if name.empty?

    element = Element.new(name, parent: parent_element)
    if parent_element.nil?
      parent_element = element
    else
      parent_element.element = element
    end

    element.attributes = extract_attributes(file)

    # script/style content is kept as one text child and not parsed.
    if EXCEPTIONAL_ELEMENT_NAME_LIST.include?(name.downcase)
      element.element = extract_exceptional_element_text(file, name)
      return parent_element
    end

    # text if >..., close if /, or open element if >...<
    Text.add_text_to_element_or_parse(file, element)

    # Elements that are never closed (e.g. <img ...>) hand their children
    # over to the parent. The check is always run because it consumes the
    # closing tag; a root element has no other parent to hand children to.
    closed = validate_closing_element?(element, file)
    element.move_children_to(parent_element) unless closed || parent_element.equal?(element)

    parent_element
  end

  private

  # Consumes input up to the closing of +element+.
  #
  # @return [Boolean] true for "/>" or a closing tag with the element's
  #   name (compared case-insensitively); false when the element turns out
  #   not to be closed here
  def validate_closing_element?(element, file)
    open = false

    while (c = file.getc)
      if c == '/' # /> or </
        open = false
        cur_pos = DOMParserHelper.cur_pos(file, c)
        next_c = file.getc
        return true if next_c == '>' # case of />

        # Check whether name is the same or not
        file.ungetc(next_c)
        close_name = DOMParserHelper.extract_string(file)
        return false if close_name.empty?
        return true if close_name.casecmp?(element.name)

        # Keep scanning unless the closing tag belongs to an ancestor.
        next unless element.parents?(close_name)

        # Rewind to the "/" so the ancestor can see its own closing tag.
        file.pos = cur_pos
        return false
      elsif c == '<' # case of </
        open = true
      elsif open
        # "<" followed by something other than "/": a new element starts.
        file.ungetc(c)
        return false
      end
    end
    false
  end

  # Reads the raw content of a script/style element. Start from > after
  # attributes; stops after the matching closing tag (name compared
  # case-insensitively) or at the end of the stream.
  #
  # @return [Text, nil] the content, or nil when it is empty
  def extract_exceptional_element_text(file, name)
    DOMParserHelper.skip_ignore_chars(file)
    file.getc # drop >
    string = +''
    while (c = file.getc)
      if c == '<'
        cur_pos = file.pos
        if file.getc == '/' && name.casecmp?(DOMParserHelper.extract_string(file))
          DOMParserHelper.skip_ignore_chars(file)
          file.getc # drop >
          break
        end
        # Not the closing tag: "<" is ordinary content.
        file.pos = cur_pos
      end
      string << c
    end
    return Text.new(string) unless string.empty?

    nil
  end

  # Scans forward to the next "<" that is not a "<!" construct and reads
  # the name that follows it.
  #
  # @return [String] the name, "" at the end of the stream or when "<" is
  #   not followed by a name
  def extract_element_name(file)
    while (c = file.getc)
      next unless c == '<'

      cur_pos = file.pos
      if file.getc == '!'
        # Skip like <!document html>, <!--
        DOMParserHelper.extract_bang_text(file)
        next
      end
      file.pos = cur_pos
      return DOMParserHelper.extract_string(file)
    end
    ''
  end

  # @return [Array<Attribute>] attributes read until Attribute.value_of
  #   returns nil
  def extract_attributes(file)
    attributes = []
    while (attribute = Attribute.value_of(file))
      attributes.push(attribute)
    end
    attributes
  end
end
|
|
133
|
+
|
|
134
|
+
# @param name [String] tag name
# @param parent [Element, nil] enclosing element, nil for a root
# @param attributes [Array<Attribute>] attributes of the element
# @param children [Array] child nodes (elements and texts)
def initialize(name, parent: nil, attributes: [], children: [])
  @name = name
  @parent = parent
  @attributes = attributes
  @children = children
end
|
|
20
140
|
|
|
21
141
|
# Appends a child node; nil is ignored.
#
# @param element [Object, nil] node to append to the children
def element=(element)
  @children.push(element) unless element.nil?
end
|
|
24
144
|
|
|
25
145
|
def each
|
|
@@ -27,6 +147,44 @@ module Hiroiyomi
|
|
|
27
147
|
yield child
|
|
28
148
|
end
|
|
29
149
|
end
|
|
150
|
+
|
|
151
|
+
# Hands all children over to +element+ and empties this element.
# Moving to self is a no-op: appending to the list that is being walked
# would never finish. Children that expose a parent writer are re-parented
# so their parent matches the element that now holds them.
#
# @param element [#element=] the receiving element
def move_children_to(element)
  return if element.equal?(self)

  children.each do |child|
    child.parent = element if child.respond_to?(:parent=)
    element.element = child
  end
  children.clear
end
|
|
157
|
+
|
|
158
|
+
# Whether any ancestor (parent, grandparent, ...) has the given tag name.
# Names are compared case-insensitively; the element itself is not checked.
#
# @param name [String] tag name to look for
# @return [Boolean]
def parents?(name)
  wanted = name.to_s
  ancestor = parent
  while ancestor
    return true if ancestor.name.casecmp?(wanted)

    ancestor = ancestor.parent
  end
  false
end
|
|
163
|
+
|
|
164
|
+
# Collects this element (if its name matches) and the matching elements
# below it. A matching child is collected and not searched any further;
# a non-matching child is searched recursively. Text nodes are skipped.
#
# @param search_name_list [Array<String>] lower-case tag names
# @param searched [Array] accumulator the matches are appended to
# @return [Array] +searched+
def deep_select(search_name_list = [], searched = [])
  searched.push(self) if search_name_list.include?(name.downcase)

  children.each do |child|
    next if child.text?

    if search_name_list.include?(child.name.downcase)
      searched.push(child)
    else
      child.deep_select(search_name_list, searched)
    end
  end

  searched
end
|
|
176
|
+
|
|
177
|
+
# Serialises the element as "<name attrs>children</name>". Attributes are
# joined with single spaces; children are concatenated without separators.
#
# @return [String]
def to_s
  attrs = attributes.map(&:to_s).join(' ')
  attrs = " #{attrs}" unless attrs.empty?
  "<#{name}#{attrs}>#{innerHTML}</#{name}>"
end

private

# @return [String] string forms of all children, concatenated
def innerHTML
  children.map(&:to_s).join
end
|
|
30
188
|
end
|
|
31
189
|
end
|
|
32
190
|
end
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'hiroiyomi/html/childable'
|
|
4
|
+
require 'hiroiyomi/html/element'
|
|
5
|
+
|
|
6
|
+
module Hiroiyomi
  module Html
    # Text node: a piece of character data that is a child of an element.
    class Text
      include Childable
      attr_accessor :value

      class << self
        # Reads the content of +element+. Start from > after attributes.
        # Text is collected into Text children; every "<" that opens a new
        # element is handed to Element.value_of. Reading stops at a "/" that
        # does not start a "/* ... */" block (the stream is rewound to that
        # "/" so the caller can inspect the closing) or at the end of the
        # stream.
        #
        # NOTE(review): because any "/" outside "/* */" ends the scan, a
        # slash inside plain text also ends the text there — confirm whether
        # that is intended.
        #
        # @param file [#getc, #ungetc, #pos, #pos=] stream to read from
        # @param element [Element] element that receives the children
        # @return [Element] the element as returned by the last
        #   Element.value_of call, or +element+ itself
        def add_text_to_element_or_parse(file, element)
          close = false # true once the ">" of the opening tag has been seen
          string = ''

          # Characters before the opening tag's ">" are not text and dropped.
          append_string = lambda { |str|
            string += str if close
          }

          # Flushes the collected text (tabs/newlines removed, stripped) into
          # a new Text child; nothing is added when it is empty.
          add_text_to_element = lambda { |str = ''|
            append_string.call str
            string = string.delete("\t\r\n").strip
            unless string.empty?
              element.element = new(string)
              string = ''
            end
          }

          while (c = file.getc)
            case c
            when '/' # /* */ ?
              cur_pos = DOMParserHelper.cur_pos(file, c)
              next_c = file.getc
              if next_c == '*'
                # Keep a /* ... */ block verbatim as text.
                add_text_to_element.call "#{c}#{next_c}#{DOMParserHelper.extract_text_with_symbols(file, next_c, c)}"
                next
              end
              # / is of /> (or of a closing tag): rewind to it and stop.
              file.pos = cur_pos
              break
            when '<'
              cur_pos = DOMParserHelper.cur_pos(file, c)
              next_c = file.getc
              if next_c == '!'
                bang_string = DOMParserHelper.extract_bang_text(file)
                unless bang_string.nil?
                  # empty if comment
                  add_text_to_element.call "#{c}#{next_c}#{bang_string}" unless bang_string.empty?
                  next
                end
              end
              # Rewind to the "<" so Element.value_of sees the whole tag.
              file.pos = cur_pos

              add_text_to_element.call

              # Next element from < char
              element = Element.value_of(file, element)
            when '>' # > is of >...
              close = true
            else
              append_string.call c
            end
          end

          add_text_to_element.call
          element
        end
      end

      # @param value [String] the text content
      def initialize(value)
        @value = value
      end

      # @return [true] marks this node as text
      def text?
        true
      end

      # @return [String] the text content
      def to_s
        value
      end
    end
  end
end
|
data/lib/hiroiyomi/parser.rb
CHANGED
|
@@ -7,22 +7,14 @@ module Hiroiyomi
|
|
|
7
7
|
# Parser
|
|
8
8
|
module Parser
|
|
9
9
|
# Gives every including class a class-level +read+ that builds a fresh
# instance and delegates to its instance-level +read+.
#
# @param klass [Class] the class that includes this module
def self.included(klass)
  klass.define_singleton_method(:read) do |url, filter:, is_deep: true|
    new.read(url, filter: filter, is_deep: is_deep)
  end
end
|
|
18
14
|
|
|
19
|
-
|
|
20
|
-
# @param [Array] filter of filtered by name list, e.g. [h1, h2, h3]
|
|
21
|
-
#
|
|
22
|
-
# @return [Array] of Hiroiyomi::Html::Element which has been filtered
|
|
23
|
-
def read(url, filter:)
|
|
15
|
+
# Opens +url+, parses it and returns the filtered result. The opened file
# is unlinked afterwards, also when parsing or filtering raises.
#
# @param url [String] location passed to open_url
# @param filter [Array] tag names passed on to do_filter
# @param is_deep [Boolean] passed on to do_filter
# @return [Object] whatever do_filter returns
def read(url, filter:, is_deep: true)
  # Forget the handle of a previous call first, so that a failing open_url
  # does not make the ensure clause unlink an old file again.
  @open_file = nil
  @open_file = open_url(url)
  do_filter(do_parse(@open_file), filter: filter, is_deep: is_deep)
ensure
  @open_file&.unlink
end
|
data/lib/hiroiyomi/version.rb
CHANGED
data/lib/hiroiyomi.rb
CHANGED
|
@@ -2,16 +2,17 @@
|
|
|
2
2
|
|
|
3
3
|
require 'hiroiyomi/version'
|
|
4
4
|
require 'hiroiyomi/root'
|
|
5
|
-
require 'hiroiyomi/
|
|
5
|
+
require 'hiroiyomi/html/dom_parser'
|
|
6
6
|
|
|
7
7
|
# Hiroiyomi
|
|
8
8
|
module Hiroiyomi
|
|
9
9
|
# Reads the HTML at +url+ and returns the elements selected by +filter+.
#
# @param [String] url URL
# @param [Array] filter tag names to select, e.g. [h1, h2, h3]
# @param [Boolean] is_deep whether each selected element is additionally
#   searched for matching elements among its descendants
#
# @return [Array] of Hiroiyomi::Html::Element which has been filtered
def read(url, filter: [], is_deep: true)
  Html::DOMParser.read(url, filter: filter, is_deep: is_deep)
end
|
|
16
17
|
|
|
17
18
|
# rubocop:disable Style/AccessModifierDeclarations
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: hiroiyomi
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.
|
|
4
|
+
version: 0.1.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Tomonori Murakami
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2018-07-
|
|
11
|
+
date: 2018-07-14 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -76,6 +76,7 @@ files:
|
|
|
76
76
|
- ".gitignore"
|
|
77
77
|
- ".rspec"
|
|
78
78
|
- ".rubocop.yml"
|
|
79
|
+
- ".ruby-version"
|
|
79
80
|
- ".travis.yml"
|
|
80
81
|
- Gemfile
|
|
81
82
|
- README.md
|
|
@@ -85,9 +86,12 @@ files:
|
|
|
85
86
|
- hiroiyomi.gemspec
|
|
86
87
|
- lib/hiroiyomi.rb
|
|
87
88
|
- lib/hiroiyomi/html/attribute.rb
|
|
89
|
+
- lib/hiroiyomi/html/childable.rb
|
|
88
90
|
- lib/hiroiyomi/html/document.rb
|
|
91
|
+
- lib/hiroiyomi/html/dom_parser.rb
|
|
92
|
+
- lib/hiroiyomi/html/dom_parser_helper.rb
|
|
89
93
|
- lib/hiroiyomi/html/element.rb
|
|
90
|
-
- lib/hiroiyomi/
|
|
94
|
+
- lib/hiroiyomi/html/text.rb
|
|
91
95
|
- lib/hiroiyomi/parser.rb
|
|
92
96
|
- lib/hiroiyomi/root.rb
|
|
93
97
|
- lib/hiroiyomi/version.rb
|
|
@@ -1,191 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'hiroiyomi/parser'
|
|
4
|
-
require 'hiroiyomi/html/document'
|
|
5
|
-
require 'hiroiyomi/html/element'
|
|
6
|
-
require 'hiroiyomi/html/attribute'
|
|
7
|
-
|
|
8
|
-
module Hiroiyomi
|
|
9
|
-
# HtmlParser
|
|
10
|
-
# rubocop:disable Metrics/ClassLength
|
|
11
|
-
class HtmlParser
|
|
12
|
-
include Parser
|
|
13
|
-
|
|
14
|
-
private
|
|
15
|
-
|
|
16
|
-
def do_parse(file)
|
|
17
|
-
document = Html::Document.new
|
|
18
|
-
return document if file.nil?
|
|
19
|
-
|
|
20
|
-
track_element(file, document)
|
|
21
|
-
end
|
|
22
|
-
|
|
23
|
-
# ========
|
|
24
|
-
# Extract HTML Element
|
|
25
|
-
# ========
|
|
26
|
-
|
|
27
|
-
def track_element(file, document)
|
|
28
|
-
while (c = file.getc)
|
|
29
|
-
break if c == '<' && extract_element(file, document)
|
|
30
|
-
end
|
|
31
|
-
document
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
def extract_element(file, document)
|
|
35
|
-
name = extract_name(file)
|
|
36
|
-
return false if name.empty?
|
|
37
|
-
|
|
38
|
-
attributes = extract_attributes(file)
|
|
39
|
-
element = Html::Element.new(name, attributes: attributes)
|
|
40
|
-
content = extract_content(file, element)
|
|
41
|
-
element.content = content unless content.empty?
|
|
42
|
-
|
|
43
|
-
document.element = element if validate_closing_element?(name, file)
|
|
44
|
-
true
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
# rubocop:disable Metrics/MethodLength
|
|
48
|
-
def extract_name(file, skip_space: false)
|
|
49
|
-
name = ''
|
|
50
|
-
while (c = file.getc)
|
|
51
|
-
case c
|
|
52
|
-
when /[\w-]/
|
|
53
|
-
name += c
|
|
54
|
-
else
|
|
55
|
-
next if skip_space && c =~ /\s/
|
|
56
|
-
file.ungetc(c)
|
|
57
|
-
break
|
|
58
|
-
end
|
|
59
|
-
end
|
|
60
|
-
name
|
|
61
|
-
end
|
|
62
|
-
|
|
63
|
-
# rubocop:enable Metrics/MethodLength
|
|
64
|
-
|
|
65
|
-
def extract_attributes(file)
|
|
66
|
-
attributes = []
|
|
67
|
-
while (attribute = extract_attribute(file))
|
|
68
|
-
attributes.push(attribute)
|
|
69
|
-
end
|
|
70
|
-
attributes
|
|
71
|
-
end
|
|
72
|
-
|
|
73
|
-
# rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity
|
|
74
|
-
def extract_attribute(file)
|
|
75
|
-
name = extract_name(file, skip_space: true)
|
|
76
|
-
return nil if name.empty?
|
|
77
|
-
|
|
78
|
-
value = ''
|
|
79
|
-
open = false
|
|
80
|
-
while (c = file.getc)
|
|
81
|
-
case c
|
|
82
|
-
when '"'
|
|
83
|
-
break if open
|
|
84
|
-
open = true
|
|
85
|
-
else
|
|
86
|
-
value += c if open
|
|
87
|
-
end
|
|
88
|
-
end
|
|
89
|
-
|
|
90
|
-
Html::Attribute.new(name, value.empty? ? nil : value)
|
|
91
|
-
end
|
|
92
|
-
|
|
93
|
-
# rubocop:enable Metrics/MethodLength, Metrics/CyclomaticComplexity
|
|
94
|
-
|
|
95
|
-
# rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/AbcSize
|
|
96
|
-
def extract_content(file, document)
|
|
97
|
-
content = ''
|
|
98
|
-
close = false
|
|
99
|
-
|
|
100
|
-
append_content = lambda { |str|
|
|
101
|
-
content += str if close
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
while (c = file.getc)
|
|
105
|
-
case c
|
|
106
|
-
when '/'
|
|
107
|
-
# /*<![CDATA[*/!function(e,t,r){function ... ])/*]]>*/
|
|
108
|
-
next_c = file.getc
|
|
109
|
-
if next_c == '*'
|
|
110
|
-
append_content.call(c + next_c)
|
|
111
|
-
content += extract_content_of_cddata(file)
|
|
112
|
-
elsif !close
|
|
113
|
-
file.ungetc(c)
|
|
114
|
-
break
|
|
115
|
-
end
|
|
116
|
-
when '<'
|
|
117
|
-
extract_element(file, document)
|
|
118
|
-
# file.ungetc(c)
|
|
119
|
-
# track_element(file, document)
|
|
120
|
-
close = false
|
|
121
|
-
when '>'
|
|
122
|
-
close ||= true
|
|
123
|
-
else
|
|
124
|
-
append_content.call(c)
|
|
125
|
-
end
|
|
126
|
-
end
|
|
127
|
-
content
|
|
128
|
-
end
|
|
129
|
-
|
|
130
|
-
# /*<![CDATA[*/!function(e,t,r){function ... ])/*]]>*/
|
|
131
|
-
def extract_content_of_cddata(file)
|
|
132
|
-
content = ''
|
|
133
|
-
start_cddata = false
|
|
134
|
-
|
|
135
|
-
append_content = lambda { |str|
|
|
136
|
-
content += str
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
while (c = file.getc)
|
|
140
|
-
case c
|
|
141
|
-
when '/'
|
|
142
|
-
next_c = file.getc
|
|
143
|
-
append_content.call(c + next_c) if next_c == '*'
|
|
144
|
-
when '*' # /*<![CDATA[*/!function(e,t,r){function ... ])/*]]>*/
|
|
145
|
-
next_c = file.getc
|
|
146
|
-
unless next_c == '/'
|
|
147
|
-
file.ungetc(next_c)
|
|
148
|
-
next_c = ''
|
|
149
|
-
end
|
|
150
|
-
start_cddata = !start_cddata
|
|
151
|
-
append_content.call(c + next_c)
|
|
152
|
-
return content unless start_cddata
|
|
153
|
-
else
|
|
154
|
-
append_content.call(c)
|
|
155
|
-
end
|
|
156
|
-
end
|
|
157
|
-
content
|
|
158
|
-
end
|
|
159
|
-
|
|
160
|
-
# rubocop:enable Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/AbcSize
|
|
161
|
-
|
|
162
|
-
def validate_closing_element?(element_name, file)
|
|
163
|
-
open = false
|
|
164
|
-
while (c = file.getc)
|
|
165
|
-
return !open || extract_name(file) == element_name if c == '/'
|
|
166
|
-
open = true if c == '<'
|
|
167
|
-
end
|
|
168
|
-
false
|
|
169
|
-
end
|
|
170
|
-
|
|
171
|
-
# ========
|
|
172
|
-
# Filter HTML Element
|
|
173
|
-
# ========
|
|
174
|
-
|
|
175
|
-
def do_filter(document, filter:)
|
|
176
|
-
filter_element(document, filter, [])
|
|
177
|
-
end
|
|
178
|
-
|
|
179
|
-
def filter_element(element, filter, extracted_elements)
|
|
180
|
-
element.each do |child|
|
|
181
|
-
if filter&.include?(child.name)
|
|
182
|
-
extracted_elements.push(child)
|
|
183
|
-
else
|
|
184
|
-
filter_element(child, filter, extracted_elements)
|
|
185
|
-
end
|
|
186
|
-
end
|
|
187
|
-
extracted_elements
|
|
188
|
-
end
|
|
189
|
-
end
|
|
190
|
-
# rubocop:enable Metrics/ClassLength
|
|
191
|
-
end
|