webg 0.1.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/exe/webg +61 -14
- data/lib/webg/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 941940d059cf78116a42cdf6733bea9dd14a243e1fcde8773494a6eabb3c5e43
|
4
|
+
data.tar.gz: c2188442e6ab0f3cefea91d0a900259fcf254184b9fe38076a34899e4b924196
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 35f501258470789b85c1ea9543a611ae35d1e40ac35317fdf6fd0a139e1111ed7e7772ecba1f7fd7a7a7feef5eb27e602333cc9da1ade6ebc1bcaea051c357df
|
7
|
+
data.tar.gz: cfb7b398c2c399b79bac1d7ffa22b15f102da564a94c22a9fabd7bbe98af41926e3c9e5b6dc648feef2eb06faea778c99d22bf1c84b6d8082942616f5b8abc0e
|
data/exe/webg
CHANGED
@@ -7,17 +7,36 @@ require "nokogiri"
|
|
7
7
|
|
8
8
|
require "webg/version"
|
9
9
|
|
10
|
+
module Squeezable
|
11
|
+
def squeezed_text(document)
|
12
|
+
return text(document).each_line.map { |l|
|
13
|
+
l.gsub(/\p{Space}+/, " ").strip
|
14
|
+
}.join("\n").strip.gsub(/\n{3,}/, "\n\n")
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
10
18
|
module Fetcher
|
11
19
|
end
|
12
20
|
|
13
21
|
class Fetcher::Raw
|
22
|
+
DEFAULT_USER_AGENT = "webg/#{Webg::VERSION}"
|
23
|
+
|
24
|
+
def initialize(headers)
|
25
|
+
@headers = headers
|
26
|
+
end
|
27
|
+
|
14
28
|
def call(uri)
|
15
29
|
require("open-uri")
|
16
|
-
return uri.read
|
30
|
+
return uri.read({"User-Agent" => DEFAULT_USER_AGENT}.merge(@headers))
|
17
31
|
end
|
18
32
|
end
|
19
33
|
|
20
34
|
class Fetcher::Firefox
|
35
|
+
def initialize(headers)
|
36
|
+
return if headers.empty?
|
37
|
+
raise "specifying headers on Firefox is not supported"
|
38
|
+
end
|
39
|
+
|
21
40
|
def call(uri)
|
22
41
|
require("capybara")
|
23
42
|
session = Capybara::Session.new(:selenium_headless)
|
@@ -30,6 +49,8 @@ module Selector
|
|
30
49
|
end
|
31
50
|
|
32
51
|
class Selector::All
|
52
|
+
include Squeezable
|
53
|
+
|
33
54
|
REJECT_TAG_NAMES = %w[script noscript style]
|
34
55
|
|
35
56
|
def raw(document)
|
@@ -43,32 +64,33 @@ class Selector::All
|
|
43
64
|
end
|
44
65
|
|
45
66
|
class Selector::Css
|
67
|
+
include Squeezable
|
68
|
+
|
46
69
|
def initialize(css_selectors)
|
47
70
|
@css_selectors = css_selectors
|
48
71
|
end
|
49
72
|
|
50
73
|
def raw(document)
|
51
|
-
return
|
52
|
-
node.evaluate_script('this.outerHTML')
|
53
|
-
}.join("\n")
|
74
|
+
return process(document, :to_s)
|
54
75
|
end
|
55
76
|
|
56
|
-
def text(
|
57
|
-
return
|
77
|
+
def text(document)
|
78
|
+
return process(document, :text)
|
58
79
|
end
|
59
80
|
|
60
81
|
private
|
61
82
|
|
62
|
-
def
|
63
|
-
return
|
83
|
+
def process(document, method)
|
84
|
+
return document.css(@css_selectors.join(", ")).map(&method).join("\n")
|
64
85
|
end
|
65
86
|
end
|
66
87
|
|
67
88
|
def parse_options(argv)
|
68
89
|
argv = argv.dup
|
69
90
|
fetcher = :raw
|
91
|
+
headers = {}
|
70
92
|
css_selectors = []
|
71
|
-
|
93
|
+
output_method_name = :raw
|
72
94
|
|
73
95
|
parser = OptionParser.new
|
74
96
|
parser.version = Webg::VERSION
|
@@ -81,6 +103,26 @@ def parse_options(argv)
|
|
81
103
|
) do
|
82
104
|
fetcher = :firefox
|
83
105
|
end
|
106
|
+
parser.on(
|
107
|
+
"--user-agent=USER-AGENT",
|
108
|
+
"specify User-Agent header",
|
109
|
+
) do |ua|
|
110
|
+
headers["User-Agent"] = ua
|
111
|
+
end
|
112
|
+
parser.on(
|
113
|
+
"--referer=REFERER",
|
114
|
+
"specify Referer header",
|
115
|
+
) do |referer|
|
116
|
+
headers["Referer"] = referer
|
117
|
+
end
|
118
|
+
parser.on(
|
119
|
+
"--header=HEADER-LINE",
|
120
|
+
"specify various headers in HTTP request. e.g: --header='Accept-Language: ja'"
|
121
|
+
) do |header_line|
|
122
|
+
md = /:\s+/.match(header_line)
|
123
|
+
raise "cannot parse header-line(#{header_line})" if !md
|
124
|
+
headers[md.pre_match] = md.post_match
|
125
|
+
end
|
84
126
|
parser.on(
|
85
127
|
"--css-selector=SELECTOR",
|
86
128
|
"specify css selector to filter output.",
|
@@ -91,7 +133,13 @@ def parse_options(argv)
|
|
91
133
|
"--text",
|
92
134
|
"output only text",
|
93
135
|
) do
|
94
|
-
|
136
|
+
output_method_name = :text
|
137
|
+
end
|
138
|
+
parser.on(
|
139
|
+
"--squeezed-text",
|
140
|
+
"output text with squeezing white spaces",
|
141
|
+
) do
|
142
|
+
output_method_name = :squeezed_text
|
95
143
|
end
|
96
144
|
parser.parse!(argv)
|
97
145
|
|
@@ -102,15 +150,14 @@ def parse_options(argv)
|
|
102
150
|
end
|
103
151
|
uri = URI(uri)
|
104
152
|
|
105
|
-
return uri, fetcher, css_selectors,
|
153
|
+
return uri, fetcher, headers, css_selectors, output_method_name
|
106
154
|
end
|
107
155
|
|
108
156
|
begin
|
109
|
-
uri, fetcher_name, css_selectors,
|
157
|
+
uri, fetcher_name, headers, css_selectors, output_method_name = parse_options(ARGV)
|
110
158
|
|
111
|
-
fetcher = Fetcher.const_get(fetcher_name.capitalize).new
|
159
|
+
fetcher = Fetcher.const_get(fetcher_name.capitalize).new(headers)
|
112
160
|
selector = css_selectors.empty? ? Selector::All.new : Selector::Css.new(css_selectors)
|
113
|
-
output_method_name = text ? :text : :raw
|
114
161
|
|
115
162
|
document = Nokogiri::HTML.parse(fetcher.(uri))
|
116
163
|
puts(selector.public_send(output_method_name, document))
|
data/lib/webg/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yuya.Nishida.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-07-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|
@@ -77,7 +77,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
77
77
|
- !ruby/object:Gem::Version
|
78
78
|
version: '0'
|
79
79
|
requirements: []
|
80
|
-
rubygems_version: 3.
|
80
|
+
rubygems_version: 3.3.7
|
81
81
|
signing_key:
|
82
82
|
specification_version: 4
|
83
83
|
summary: 'webg: A downloader to get web page with JavaScript'
|