qnd_html2page 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/lib/qnd_html2page.rb +50 -27
- metadata +2 -2
- metadata.gz.sig +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ffa5d9991d46892e27a79ec207484b95b61c846f7cc410c3ec3cf340556f06eb
|
4
|
+
data.tar.gz: 598d0458826f7c293d2d9a0cbdcd31d8e191e37e1bb04520d3bbd73ce5ba952a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 24c2d07d88d7276e57e24cb59ad0240f3ce4e29a0dfbd6ffcefa510502f2342c8d5ffb1045e15165add188d5659d7766415f6763225aecb426c913b68b7bd87e
|
7
|
+
data.tar.gz: ef2828615ea6d377efd93d704b5cbe6c3025a2bc3ba5574fa153571d34b527eab269e1250cba6cbafbd6557f40f86c9ba22c0b9801ae268b8c3501441dbc5b36
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data.tar.gz.sig
CHANGED
Binary file
|
data/lib/qnd_html2page.rb
CHANGED
@@ -8,14 +8,31 @@ require 'tempfile'
|
|
8
8
|
require 'rxfhelper'
|
9
9
|
|
10
10
|
|
11
|
+
module ArraySlices
|
12
|
+
|
13
|
+
refine Array do
|
14
|
+
|
15
|
+
def slice_at(*indices)
|
16
|
+
|
17
|
+
a = indices
|
18
|
+
a << -1 if a[-1] != self[-1]
|
19
|
+
a.unshift -1
|
20
|
+
a.each_cons(2).map {|x1, x2| self.slice(x1+1..x2) }
|
21
|
+
|
22
|
+
end
|
23
|
+
|
24
|
+
end
|
25
|
+
end
|
11
26
|
|
12
27
|
class QndHtml2Page
|
28
|
+
using ArraySlices
|
29
|
+
using ColouredText
|
13
30
|
|
14
31
|
attr_reader :to_pages
|
15
32
|
|
16
|
-
def initialize(html, debug: false, pg_height:
|
33
|
+
def initialize(html, debug: false, pg_height: 770, width: '700px')
|
17
34
|
|
18
|
-
@html, @height, @debug = html, pg_height, debug
|
35
|
+
@html, @height, @width, @debug = html, pg_height, width, debug
|
19
36
|
@to_pages = scan(RXFHelper.read(@html).first)
|
20
37
|
|
21
38
|
end
|
@@ -28,11 +45,12 @@ class QndHtml2Page
|
|
28
45
|
|
29
46
|
doc = Rexle.new(html)
|
30
47
|
body = doc.root.element('body')
|
48
|
+
body.attributes[:style] = 'width: ' + @width
|
31
49
|
|
32
50
|
count = 0
|
33
51
|
body.each_recursive do |e|
|
34
52
|
|
35
|
-
puts 'e: ' + e.
|
53
|
+
puts ('e: ' + e.xml).debug if @debug
|
36
54
|
ignore_list = %w(span b li tr td dt dd em strong i a)
|
37
55
|
next if ignore_list.include? e.name
|
38
56
|
span = Rexle::Element.new('span').add_text(count.to_s)
|
@@ -49,46 +67,51 @@ class QndHtml2Page
|
|
49
67
|
File.write tmpfile.path + '.html', doc.root.xml
|
50
68
|
|
51
69
|
browser = Ferrum::Browser.new
|
70
|
+
#browser.resize width: 300, height: 300
|
52
71
|
browser.goto('file://' + tmpfile.path + '.html')
|
72
|
+
browser.screenshot(path: "/tmp/page.jpg")
|
53
73
|
span_list = browser.xpath('//span')
|
54
74
|
a = span_list.map {|x| [x.text, x.find_position.last] }
|
55
75
|
|
56
|
-
|
76
|
+
|
77
|
+
heights = ((a.last.last) / @height).round.to_i.times.inject([@height]) {|r, x| r << (r.last + @height) }
|
78
|
+
puts ('heights: ' + heights.inspect).debug if @debug
|
79
|
+
height = heights.shift
|
57
80
|
|
58
81
|
a2 = a.inject([[]]) do |r,x|
|
59
82
|
|
60
|
-
puts 'r: ' + x.inspect if @debug
|
61
|
-
puts 'x: ' + x.inspect if @debug
|
62
|
-
|
83
|
+
puts ('r: ' + x.inspect).debug if @debug
|
84
|
+
puts ('x: ' + x.inspect).debug if @debug
|
85
|
+
puts ('height: ' + height.inspect).debug if @debug
|
86
|
+
|
87
|
+
x.last < height ? (r.last << x) : (height = heights.shift; r << [x])
|
63
88
|
r
|
64
89
|
|
65
90
|
end
|
66
91
|
|
67
92
|
|
68
93
|
elements = doc.root.element('body').elements.to_a
|
69
|
-
puts 'elements.length: ' + elements.length.inspect if @debug
|
70
|
-
|
71
|
-
|
72
|
-
puts 'a2: ' + a2.inspect if @debug
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
puts 'e.text: ' + e.text.inspect
|
83
|
-
e.text != id
|
84
|
-
end
|
94
|
+
puts ('elements.length: ' + elements.length.inspect).debug if @debug
|
95
|
+
offset2 = 0
|
96
|
+
|
97
|
+
puts ('a2: ' + a2.inspect).debug if @debug
|
98
|
+
|
99
|
+
# find each last record span stop using the given id
|
100
|
+
stops = a2.map do |x|
|
101
|
+
elements.index(elements.find {|e| e.text == x.last.first })
|
102
|
+
end
|
103
|
+
|
104
|
+
puts ('stops: ' + stops.inspect).debug if @debug
|
105
|
+
|
106
|
+
pages = elements.slice_at(*stops).map do |e_list|
|
85
107
|
|
86
|
-
offset = a3.length
|
87
108
|
div = Rexle::Element.new 'div'
|
88
|
-
|
89
|
-
|
109
|
+
e_list.reject! {|e| e.name == 'span' and e.attributes[:class] == 'qndhtml2pg' }
|
110
|
+
next if e_list.empty?
|
111
|
+
e_list.each {|e| div.add e}
|
112
|
+
|
90
113
|
div
|
91
|
-
end
|
114
|
+
end.compact
|
92
115
|
|
93
116
|
@to_pages = pages
|
94
117
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: qnd_html2page
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
iHadjaBK6nUXzIZ2OCEldp3dzkozgJxu7tcb1Kmr9uAUFoot4w6yl+Kr4JZIW1ml
|
36
36
|
u3jdsCCQLM9dC78qib2fY8vh
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2019-11-
|
38
|
+
date: 2019-11-16 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: rxfhelper
|
metadata.gz.sig
CHANGED
@@ -1 +1,2 @@
|
|
1
|
-
|
1
|
+
�pޛ<��P�1W?��t�7�\p��@��_�Ȓ�D�˟1���z�X��jq8(z։b�:�qS�{�aјCDžt��c�]��ۄ��`��,���w.?F-`ܮQ�4E
|
2
|
+
dCG�Yk�
|