webg 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/exe/webg +61 -14
- data/lib/webg/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 941940d059cf78116a42cdf6733bea9dd14a243e1fcde8773494a6eabb3c5e43
|
4
|
+
data.tar.gz: c2188442e6ab0f3cefea91d0a900259fcf254184b9fe38076a34899e4b924196
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 35f501258470789b85c1ea9543a611ae35d1e40ac35317fdf6fd0a139e1111ed7e7772ecba1f7fd7a7a7feef5eb27e602333cc9da1ade6ebc1bcaea051c357df
|
7
|
+
data.tar.gz: cfb7b398c2c399b79bac1d7ffa22b15f102da564a94c22a9fabd7bbe98af41926e3c9e5b6dc648feef2eb06faea778c99d22bf1c84b6d8082942616f5b8abc0e
|
data/exe/webg
CHANGED
@@ -7,17 +7,36 @@ require "nokogiri"
|
|
7
7
|
|
8
8
|
require "webg/version"
|
9
9
|
|
10
|
+
module Squeezable
|
11
|
+
def squeezed_text(document)
|
12
|
+
return text(document).each_line.map { |l|
|
13
|
+
l.gsub(/\p{Space}+/, " ").strip
|
14
|
+
}.join("\n").strip.gsub(/\n{3,}/, "\n\n")
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
10
18
|
module Fetcher
|
11
19
|
end
|
12
20
|
|
13
21
|
class Fetcher::Raw
|
22
|
+
DEFAULT_USER_AGENT = "webg/#{Webg::VERSION}"
|
23
|
+
|
24
|
+
def initialize(headers)
|
25
|
+
@headers = headers
|
26
|
+
end
|
27
|
+
|
14
28
|
def call(uri)
|
15
29
|
require("open-uri")
|
16
|
-
return uri.read
|
30
|
+
return uri.read({"User-Agent" => DEFAULT_USER_AGENT}.merge(@headers))
|
17
31
|
end
|
18
32
|
end
|
19
33
|
|
20
34
|
class Fetcher::Firefox
|
35
|
+
def initialize(headers)
|
36
|
+
return if headers.empty?
|
37
|
+
raise "specifying headers on Firefox is not supported"
|
38
|
+
end
|
39
|
+
|
21
40
|
def call(uri)
|
22
41
|
require("capybara")
|
23
42
|
session = Capybara::Session.new(:selenium_headless)
|
@@ -30,6 +49,8 @@ module Selector
|
|
30
49
|
end
|
31
50
|
|
32
51
|
class Selector::All
|
52
|
+
include Squeezable
|
53
|
+
|
33
54
|
REJECT_TAG_NAMES = %w[script noscript style]
|
34
55
|
|
35
56
|
def raw(document)
|
@@ -43,32 +64,33 @@ class Selector::All
|
|
43
64
|
end
|
44
65
|
|
45
66
|
class Selector::Css
|
67
|
+
include Squeezable
|
68
|
+
|
46
69
|
def initialize(css_selectors)
|
47
70
|
@css_selectors = css_selectors
|
48
71
|
end
|
49
72
|
|
50
73
|
def raw(document)
|
51
|
-
return
|
52
|
-
node.evaluate_script('this.outerHTML')
|
53
|
-
}.join("\n")
|
74
|
+
return process(document, :to_s)
|
54
75
|
end
|
55
76
|
|
56
|
-
def text(
|
57
|
-
return
|
77
|
+
def text(document)
|
78
|
+
return process(document, :text)
|
58
79
|
end
|
59
80
|
|
60
81
|
private
|
61
82
|
|
62
|
-
def
|
63
|
-
return
|
83
|
+
def process(document, method)
|
84
|
+
return document.css(@css_selectors.join(", ")).map(&method).join("\n")
|
64
85
|
end
|
65
86
|
end
|
66
87
|
|
67
88
|
def parse_options(argv)
|
68
89
|
argv = argv.dup
|
69
90
|
fetcher = :raw
|
91
|
+
headers = {}
|
70
92
|
css_selectors = []
|
71
|
-
|
93
|
+
output_method_name = :raw
|
72
94
|
|
73
95
|
parser = OptionParser.new
|
74
96
|
parser.version = Webg::VERSION
|
@@ -81,6 +103,26 @@ def parse_options(argv)
|
|
81
103
|
) do
|
82
104
|
fetcher = :firefox
|
83
105
|
end
|
106
|
+
parser.on(
|
107
|
+
"--user-agent=USER-AGENT",
|
108
|
+
"specify User-Agent header",
|
109
|
+
) do |ua|
|
110
|
+
headers["User-Agent"] = ua
|
111
|
+
end
|
112
|
+
parser.on(
|
113
|
+
"--referer=REFERER",
|
114
|
+
"specify Referer header",
|
115
|
+
) do |referer|
|
116
|
+
headers["Referer"] = referer
|
117
|
+
end
|
118
|
+
parser.on(
|
119
|
+
"--header=HEADER-LINE",
|
120
|
+
"specify various headers in HTTP request. e.g: --header='Accept-Language: ja'"
|
121
|
+
) do |header_line|
|
122
|
+
md = /:\s+/.match(header_line)
|
123
|
+
raise "cannot parse header-line(#{header_line})" if !md
|
124
|
+
headers[md.pre_match] = md.post_match
|
125
|
+
end
|
84
126
|
parser.on(
|
85
127
|
"--css-selector=SELECTOR",
|
86
128
|
"specify css selector to filter output.",
|
@@ -91,7 +133,13 @@ def parse_options(argv)
|
|
91
133
|
"--text",
|
92
134
|
"output only text",
|
93
135
|
) do
|
94
|
-
|
136
|
+
output_method_name = :text
|
137
|
+
end
|
138
|
+
parser.on(
|
139
|
+
"--squeezed-text",
|
140
|
+
"output text with squeezing white spaces",
|
141
|
+
) do
|
142
|
+
output_method_name = :squeezed_text
|
95
143
|
end
|
96
144
|
parser.parse!(argv)
|
97
145
|
|
@@ -102,15 +150,14 @@ def parse_options(argv)
|
|
102
150
|
end
|
103
151
|
uri = URI(uri)
|
104
152
|
|
105
|
-
return uri, fetcher, css_selectors,
|
153
|
+
return uri, fetcher, headers, css_selectors, output_method_name
|
106
154
|
end
|
107
155
|
|
108
156
|
begin
|
109
|
-
uri, fetcher_name, css_selectors,
|
157
|
+
uri, fetcher_name, headers, css_selectors, output_method_name = parse_options(ARGV)
|
110
158
|
|
111
|
-
fetcher = Fetcher.const_get(fetcher_name.capitalize).new
|
159
|
+
fetcher = Fetcher.const_get(fetcher_name.capitalize).new(headers)
|
112
160
|
selector = css_selectors.empty? ? Selector::All.new : Selector::Css.new(css_selectors)
|
113
|
-
output_method_name = text ? :text : :raw
|
114
161
|
|
115
162
|
document = Nokogiri::HTML.parse(fetcher.(uri))
|
116
163
|
puts(selector.public_send(output_method_name, document))
|
data/lib/webg/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yuya.Nishida.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-07-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|
@@ -77,7 +77,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
77
77
|
- !ruby/object:Gem::Version
|
78
78
|
version: '0'
|
79
79
|
requirements: []
|
80
|
-
rubygems_version: 3.
|
80
|
+
rubygems_version: 3.3.7
|
81
81
|
signing_key:
|
82
82
|
specification_version: 4
|
83
83
|
summary: 'webg: A downloader to get web page with JavaScript'
|