http_crawler 0.2.3.3 → 0.3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.idea/inspectionProfiles/Project_Default.xml +6 -0
- data/.idea/workspace.xml +68 -101
- data/lib/http_crawler/client.rb +152 -36
- data/lib/http_crawler/http/response.rb +52 -0
- data/lib/http_crawler/http.rb +4 -3
- data/lib/http_crawler/proxy/test_proxy_api/client.rb +1 -1
- data/lib/http_crawler/version.rb +1 -1
- data/lib/http_crawler/web/baidu/client.rb +11 -7
- data/lib/http_crawler.rb +0 -1
- metadata +4 -4
- data/lib/http_crawler/net/http.rb +0 -7
- data/lib/http_crawler/net/response.rb +0 -105
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 870f70e609b513a6bdf5176e1ab2960d19e7b4f04d60c746845e6e27343288c7
|
4
|
+
data.tar.gz: 6706312c08462cb4dee0f8c2eac15b7882131832dc49f6cb336032ffdc0ab68d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3c47791191aca7f3065eee2c9e223e3196f0890a4da8cd4bf8dee697eae18133758ac3225d49503d6ece004bc55474bbd9486b140b2d1588c92ee23d8b06663d
|
7
|
+
data.tar.gz: ff4da1b65de1431b9e6aa800e63d9e0b4c97ee942b68760136deb3bd819c08fc82504159e1a1ea14b6368458e8fa6a7568521a12d0eff259c637e2092791ea6f
|
data/.idea/workspace.xml
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
<project version="4">
|
3
3
|
<component name="ChangeListManager">
|
4
4
|
<list default="true" id="07223dd4-8944-486b-a29b-7461a5c9ec2d" name="Default" comment="">
|
5
|
+
<change afterPath="$PROJECT_DIR$/.idea/inspectionProfiles/Project_Default.xml" afterDir="false" />
|
5
6
|
<change beforePath="$PROJECT_DIR$/.idea/http_crawler.iml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/http_crawler.iml" afterDir="false" />
|
6
7
|
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
7
8
|
<change beforePath="$PROJECT_DIR$/lib/http_crawler/http.rb" beforeDir="false" afterPath="$PROJECT_DIR$/lib/http_crawler/http.rb" afterDir="false" />
|
@@ -17,47 +18,11 @@
|
|
17
18
|
</component>
|
18
19
|
<component name="FileEditorManager">
|
19
20
|
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
|
20
|
-
<file leaf-file-name="version.rb" pinned="false" current-in-tab="true">
|
21
|
-
<entry file="file://$PROJECT_DIR$/lib/http_crawler/version.rb">
|
22
|
-
<provider selected="true" editor-type-id="text-editor">
|
23
|
-
<state relative-caret-position="21">
|
24
|
-
<caret line="1" column="21" selection-start-line="1" selection-start-column="21" selection-end-line="1" selection-end-column="21" />
|
25
|
-
</state>
|
26
|
-
</provider>
|
27
|
-
</entry>
|
28
|
-
</file>
|
29
|
-
<file leaf-file-name="web.rb" pinned="false" current-in-tab="false">
|
30
|
-
<entry file="file://$PROJECT_DIR$/lib/http_crawler/web.rb">
|
31
|
-
<provider selected="true" editor-type-id="text-editor">
|
32
|
-
<state relative-caret-position="126">
|
33
|
-
<caret line="6" selection-start-line="6" selection-end-line="6" />
|
34
|
-
</state>
|
35
|
-
</provider>
|
36
|
-
</entry>
|
37
|
-
</file>
|
38
21
|
<file leaf-file-name="http.rb" pinned="false" current-in-tab="false">
|
39
22
|
<entry file="file://$PROJECT_DIR$/lib/http_crawler/http.rb">
|
40
23
|
<provider selected="true" editor-type-id="text-editor">
|
41
|
-
<state relative-caret-position="
|
42
|
-
<caret line="
|
43
|
-
</state>
|
44
|
-
</provider>
|
45
|
-
</entry>
|
46
|
-
</file>
|
47
|
-
<file leaf-file-name="compat.rb" pinned="false" current-in-tab="false">
|
48
|
-
<entry file="file://$USER_HOME$/.rvm/rubies/ruby-2.4.1/lib/ruby/2.4.0/webrick/compat.rb">
|
49
|
-
<provider selected="true" editor-type-id="text-editor">
|
50
|
-
<state relative-caret-position="378">
|
51
|
-
<caret line="18" column="1" selection-start-line="18" selection-start-column="1" selection-end-line="18" selection-end-column="1" />
|
52
|
-
</state>
|
53
|
-
</provider>
|
54
|
-
</entry>
|
55
|
-
</file>
|
56
|
-
<file leaf-file-name="errno.rb" pinned="false" current-in-tab="false">
|
57
|
-
<entry file="file://$APPLICATION_HOME_DIR$/rubystubs24/errno.rb">
|
58
|
-
<provider selected="true" editor-type-id="text-editor">
|
59
|
-
<state relative-caret-position="672">
|
60
|
-
<caret line="32" column="7" selection-start-line="32" selection-start-column="7" selection-end-line="32" selection-end-column="7" />
|
24
|
+
<state relative-caret-position="504">
|
25
|
+
<caret line="24" column="22" selection-start-line="24" selection-start-column="22" selection-end-line="24" selection-end-column="22" />
|
61
26
|
</state>
|
62
27
|
</provider>
|
63
28
|
</entry>
|
@@ -71,11 +36,11 @@
|
|
71
36
|
</provider>
|
72
37
|
</entry>
|
73
38
|
</file>
|
74
|
-
<file leaf-file-name="
|
75
|
-
<entry file="file://$PROJECT_DIR$/lib/http_crawler/
|
39
|
+
<file leaf-file-name="client.rb" pinned="false" current-in-tab="false">
|
40
|
+
<entry file="file://$PROJECT_DIR$/lib/http_crawler/client.rb">
|
76
41
|
<provider selected="true" editor-type-id="text-editor">
|
77
|
-
<state relative-caret-position="
|
78
|
-
<caret line="
|
42
|
+
<state relative-caret-position="483">
|
43
|
+
<caret line="23" column="19" selection-start-line="23" selection-start-column="19" selection-end-line="23" selection-end-column="19" />
|
79
44
|
</state>
|
80
45
|
</provider>
|
81
46
|
</entry>
|
@@ -83,17 +48,17 @@
|
|
83
48
|
<file leaf-file-name="client.rb" pinned="false" current-in-tab="false">
|
84
49
|
<entry file="file://$PROJECT_DIR$/lib/http_crawler/web/baidu/client.rb">
|
85
50
|
<provider selected="true" editor-type-id="text-editor">
|
86
|
-
<state relative-caret-position="
|
87
|
-
<caret line="
|
51
|
+
<state relative-caret-position="105">
|
52
|
+
<caret line="5" selection-start-line="5" selection-end-line="5" />
|
88
53
|
</state>
|
89
54
|
</provider>
|
90
55
|
</entry>
|
91
56
|
</file>
|
92
|
-
<file leaf-file-name="
|
93
|
-
<entry file="file://$PROJECT_DIR$/
|
57
|
+
<file leaf-file-name="response.rb" pinned="false" current-in-tab="true">
|
58
|
+
<entry file="file://$PROJECT_DIR$/lib/http_crawler/net/response.rb">
|
94
59
|
<provider selected="true" editor-type-id="text-editor">
|
95
|
-
<state relative-caret-position="
|
96
|
-
<caret line="
|
60
|
+
<state relative-caret-position="65">
|
61
|
+
<caret line="4" column="21" selection-start-line="4" selection-start-column="8" selection-end-line="4" selection-end-column="21" />
|
97
62
|
</state>
|
98
63
|
</provider>
|
99
64
|
</entry>
|
@@ -104,6 +69,8 @@
|
|
104
69
|
<findStrings>
|
105
70
|
<find>Crawler::Web</find>
|
106
71
|
<find>"Crawler</find>
|
72
|
+
<find>proxy</find>
|
73
|
+
<find>auto</find>
|
107
74
|
</findStrings>
|
108
75
|
<replaceStrings>
|
109
76
|
<replace>HttpCrawler::Web</replace>
|
@@ -143,8 +110,8 @@
|
|
143
110
|
<option value="$PROJECT_DIR$/lib/http_crawler/web/baidu/client.rb" />
|
144
111
|
<option value="$PROJECT_DIR$/lib/http_crawler/proxy.rb" />
|
145
112
|
<option value="$PROJECT_DIR$/lib/http_crawler/web.rb" />
|
146
|
-
<option value="$PROJECT_DIR$/lib/http_crawler/http.rb" />
|
147
113
|
<option value="$PROJECT_DIR$/lib/http_crawler/version.rb" />
|
114
|
+
<option value="$PROJECT_DIR$/lib/http_crawler/http.rb" />
|
148
115
|
</list>
|
149
116
|
</option>
|
150
117
|
</component>
|
@@ -167,7 +134,6 @@
|
|
167
134
|
<foldersAlwaysOnTop value="true" />
|
168
135
|
</navigator>
|
169
136
|
<panes>
|
170
|
-
<pane id="Scope" />
|
171
137
|
<pane id="ProjectPane">
|
172
138
|
<subPane>
|
173
139
|
<expand>
|
@@ -188,23 +154,23 @@
|
|
188
154
|
</path>
|
189
155
|
<path>
|
190
156
|
<item name="http_crawler" type="b2602c69:ProjectViewProjectNode" />
|
191
|
-
<item name="
|
192
|
-
|
193
|
-
|
194
|
-
<item name="
|
195
|
-
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
|
196
|
-
<item name="< RVM: ruby-2.4.1 >" type="70bed36:NamedLibraryElementNode" />
|
157
|
+
<item name="http_crawler" type="462c0819:PsiDirectoryNode" />
|
158
|
+
<item name="lib" type="462c0819:PsiDirectoryNode" />
|
159
|
+
<item name="http_crawler" type="462c0819:PsiDirectoryNode" />
|
160
|
+
<item name="net" type="462c0819:PsiDirectoryNode" />
|
197
161
|
</path>
|
198
162
|
<path>
|
199
163
|
<item name="http_crawler" type="b2602c69:ProjectViewProjectNode" />
|
200
|
-
<item name="
|
201
|
-
<item name="
|
202
|
-
<item name="
|
164
|
+
<item name="http_crawler" type="462c0819:PsiDirectoryNode" />
|
165
|
+
<item name="lib" type="462c0819:PsiDirectoryNode" />
|
166
|
+
<item name="http_crawler" type="462c0819:PsiDirectoryNode" />
|
167
|
+
<item name="web" type="462c0819:PsiDirectoryNode" />
|
203
168
|
</path>
|
204
169
|
</expand>
|
205
170
|
<select />
|
206
171
|
</subPane>
|
207
172
|
</pane>
|
173
|
+
<pane id="Scope" />
|
208
174
|
</panes>
|
209
175
|
</component>
|
210
176
|
<component name="PropertiesComponent">
|
@@ -251,18 +217,19 @@
|
|
251
217
|
<workItem from="1546240992243" duration="719000" />
|
252
218
|
<workItem from="1546291493927" duration="464000" />
|
253
219
|
<workItem from="1546436457874" duration="2443000" />
|
254
|
-
<workItem from="1549964225949" duration="
|
220
|
+
<workItem from="1549964225949" duration="1209000" />
|
221
|
+
<workItem from="1550132724592" duration="3006000" />
|
222
|
+
<workItem from="1550208979012" duration="304000" />
|
255
223
|
</task>
|
256
224
|
<servers />
|
257
225
|
</component>
|
258
226
|
<component name="TimeTrackingManager">
|
259
|
-
<option name="totallyTimeSpent" value="
|
227
|
+
<option name="totallyTimeSpent" value="27627000" />
|
260
228
|
</component>
|
261
229
|
<component name="ToolWindowManager">
|
262
230
|
<frame x="0" y="0" width="1680" height="1050" extended-state="0" />
|
263
|
-
<editor active="true" />
|
264
231
|
<layout>
|
265
|
-
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.
|
232
|
+
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.35042736" />
|
266
233
|
<window_info anchor="bottom" id="TODO" order="6" />
|
267
234
|
<window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
|
268
235
|
<window_info anchor="bottom" id="Event Log" order="7" side_tool="true" />
|
@@ -270,7 +237,7 @@
|
|
270
237
|
<window_info anchor="bottom" id="Database Changes" order="7" show_stripe_button="false" />
|
271
238
|
<window_info anchor="bottom" id="Version Control" order="7" />
|
272
239
|
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
|
273
|
-
<window_info
|
240
|
+
<window_info anchor="bottom" id="Terminal" order="7" weight="0.45725647" />
|
274
241
|
<window_info id="Favorites" order="2" side_tool="true" />
|
275
242
|
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
|
276
243
|
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
|
@@ -508,13 +475,6 @@
|
|
508
475
|
</state>
|
509
476
|
</provider>
|
510
477
|
</entry>
|
511
|
-
<entry file="file://$PROJECT_DIR$/lib/http_crawler/net/response.rb">
|
512
|
-
<provider selected="true" editor-type-id="text-editor">
|
513
|
-
<state relative-caret-position="90">
|
514
|
-
<caret line="6" column="45" lean-forward="true" selection-start-line="6" selection-start-column="45" selection-end-line="6" selection-end-column="45" />
|
515
|
-
</state>
|
516
|
-
</provider>
|
517
|
-
</entry>
|
518
478
|
<entry file="file://$PROJECT_DIR$/lib/http_crawler.rb">
|
519
479
|
<provider selected="true" editor-type-id="text-editor">
|
520
480
|
<state relative-caret-position="120">
|
@@ -550,13 +510,6 @@
|
|
550
510
|
</state>
|
551
511
|
</provider>
|
552
512
|
</entry>
|
553
|
-
<entry file="file://$PROJECT_DIR$/lib/http_crawler/client.rb">
|
554
|
-
<provider selected="true" editor-type-id="text-editor">
|
555
|
-
<state relative-caret-position="90">
|
556
|
-
<caret line="6" column="36" selection-start-line="6" selection-start-column="36" selection-end-line="6" selection-end-column="36" />
|
557
|
-
</state>
|
558
|
-
</provider>
|
559
|
-
</entry>
|
560
513
|
<entry file="file://$PROJECT_DIR$/lib/http_crawler/common.rb">
|
561
514
|
<provider selected="true" editor-type-id="text-editor">
|
562
515
|
<state relative-caret-position="42">
|
@@ -564,13 +517,6 @@
|
|
564
517
|
</state>
|
565
518
|
</provider>
|
566
519
|
</entry>
|
567
|
-
<entry file="file://$PROJECT_DIR$/lib/http_crawler/proxy.rb">
|
568
|
-
<provider selected="true" editor-type-id="text-editor">
|
569
|
-
<state relative-caret-position="441">
|
570
|
-
<caret line="21" selection-start-line="21" selection-end-line="21" />
|
571
|
-
</state>
|
572
|
-
</provider>
|
573
|
-
</entry>
|
574
520
|
<entry file="file://$PROJECT_DIR$/lib/http_crawler/web.rb">
|
575
521
|
<provider selected="true" editor-type-id="text-editor">
|
576
522
|
<state relative-caret-position="126">
|
@@ -578,13 +524,6 @@
|
|
578
524
|
</state>
|
579
525
|
</provider>
|
580
526
|
</entry>
|
581
|
-
<entry file="file://$PROJECT_DIR$/lib/http_crawler/web/baidu/client.rb">
|
582
|
-
<provider selected="true" editor-type-id="text-editor">
|
583
|
-
<state relative-caret-position="21">
|
584
|
-
<caret line="1" column="18" selection-start-line="1" selection-start-column="7" selection-end-line="1" selection-end-column="18" />
|
585
|
-
</state>
|
586
|
-
</provider>
|
587
|
-
</entry>
|
588
527
|
<entry file="file://$PROJECT_DIR$/Gemfile">
|
589
528
|
<provider selected="true" editor-type-id="text-editor">
|
590
529
|
<state relative-caret-position="168">
|
@@ -599,13 +538,6 @@
|
|
599
538
|
</state>
|
600
539
|
</provider>
|
601
540
|
</entry>
|
602
|
-
<entry file="file://$PROJECT_DIR$/lib/http_crawler/http.rb">
|
603
|
-
<provider selected="true" editor-type-id="text-editor">
|
604
|
-
<state relative-caret-position="4410">
|
605
|
-
<caret line="210" column="11" lean-forward="true" selection-start-line="210" selection-start-column="11" selection-end-line="210" selection-end-column="11" />
|
606
|
-
</state>
|
607
|
-
</provider>
|
608
|
-
</entry>
|
609
541
|
<entry file="file://$APPLICATION_HOME_DIR$/rubystubs24/system_call_error.rb">
|
610
542
|
<provider selected="true" editor-type-id="text-editor">
|
611
543
|
<state relative-caret-position="150">
|
@@ -622,8 +554,43 @@
|
|
622
554
|
</entry>
|
623
555
|
<entry file="file://$PROJECT_DIR$/lib/http_crawler/version.rb">
|
624
556
|
<provider selected="true" editor-type-id="text-editor">
|
625
|
-
<state relative-caret-position="
|
626
|
-
<caret line="
|
557
|
+
<state relative-caret-position="63">
|
558
|
+
<caret line="3" selection-start-line="3" selection-end-line="3" />
|
559
|
+
</state>
|
560
|
+
</provider>
|
561
|
+
</entry>
|
562
|
+
<entry file="file://$PROJECT_DIR$/lib/http_crawler/proxy.rb">
|
563
|
+
<provider selected="true" editor-type-id="text-editor">
|
564
|
+
<state relative-caret-position="441">
|
565
|
+
<caret line="21" selection-start-line="21" selection-end-line="21" />
|
566
|
+
</state>
|
567
|
+
</provider>
|
568
|
+
</entry>
|
569
|
+
<entry file="file://$PROJECT_DIR$/lib/http_crawler/client.rb">
|
570
|
+
<provider selected="true" editor-type-id="text-editor">
|
571
|
+
<state relative-caret-position="483">
|
572
|
+
<caret line="23" column="19" selection-start-line="23" selection-start-column="19" selection-end-line="23" selection-end-column="19" />
|
573
|
+
</state>
|
574
|
+
</provider>
|
575
|
+
</entry>
|
576
|
+
<entry file="file://$PROJECT_DIR$/lib/http_crawler/web/baidu/client.rb">
|
577
|
+
<provider selected="true" editor-type-id="text-editor">
|
578
|
+
<state relative-caret-position="105">
|
579
|
+
<caret line="5" selection-start-line="5" selection-end-line="5" />
|
580
|
+
</state>
|
581
|
+
</provider>
|
582
|
+
</entry>
|
583
|
+
<entry file="file://$PROJECT_DIR$/lib/http_crawler/http.rb">
|
584
|
+
<provider selected="true" editor-type-id="text-editor">
|
585
|
+
<state relative-caret-position="504">
|
586
|
+
<caret line="24" column="22" selection-start-line="24" selection-start-column="22" selection-end-line="24" selection-end-column="22" />
|
587
|
+
</state>
|
588
|
+
</provider>
|
589
|
+
</entry>
|
590
|
+
<entry file="file://$PROJECT_DIR$/lib/http_crawler/net/response.rb">
|
591
|
+
<provider selected="true" editor-type-id="text-editor">
|
592
|
+
<state relative-caret-position="65">
|
593
|
+
<caret line="4" column="21" selection-start-line="4" selection-start-column="8" selection-end-line="4" selection-end-column="21" />
|
627
594
|
</state>
|
628
595
|
</provider>
|
629
596
|
</entry>
|
data/lib/http_crawler/client.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
load File.dirname(__FILE__) + '/http/response.rb'
|
2
|
+
|
1
3
|
module HttpCrawler
|
2
4
|
module Client
|
3
5
|
|
@@ -21,43 +23,32 @@ module HttpCrawler
|
|
21
23
|
end
|
22
24
|
end
|
23
25
|
|
24
|
-
attr_reader :
|
26
|
+
attr_reader :uri
|
25
27
|
|
26
|
-
#
|
27
28
|
# init_uri 如果未初始化@uri,则会报错
|
28
|
-
#
|
29
|
+
# 继承类需要实现 @uri = URI("http://host")
|
29
30
|
#
|
30
|
-
def
|
31
|
-
|
32
|
-
@http = HttpCrawler::HTTP.new(uri.host, uri.port)
|
33
|
-
|
34
|
-
@http.use_ssl = (uri.scheme == "https")
|
35
|
-
|
36
|
-
@http.open_timeout = 5
|
37
|
-
@http.read_timeout = 5
|
38
|
-
@http.proxy_key = "#{self.class}"
|
39
|
-
init_http
|
40
|
-
|
41
|
-
Rails.logger.debug "proxy_key => #{@http.proxy_key}"
|
42
|
-
end
|
43
|
-
|
44
|
-
# 初始化http参数
|
45
|
-
def init_http
|
46
|
-
|
31
|
+
def init_uri
|
32
|
+
@uri = nil
|
47
33
|
end
|
48
34
|
|
49
|
-
#
|
50
|
-
def
|
51
|
-
@
|
35
|
+
# 初始化超时时间
|
36
|
+
def init_timeout
|
37
|
+
@connect_time = 5
|
38
|
+
@write_time = 2
|
39
|
+
@read_time = 5
|
52
40
|
end
|
53
41
|
|
54
|
-
#
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
42
|
+
# 初始化 ssl 协议
|
43
|
+
def init_ssl
|
44
|
+
if (@uri.scheme == "https")
|
45
|
+
# ssl 协议
|
46
|
+
@ctx = OpenSSL::SSL::SSLContext.new
|
47
|
+
@ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
48
|
+
end
|
59
49
|
end
|
60
50
|
|
51
|
+
# 头文件相关方法
|
61
52
|
def header
|
62
53
|
@header ||= init_header
|
63
54
|
end
|
@@ -70,20 +61,145 @@ module HttpCrawler
|
|
70
61
|
nil
|
71
62
|
end
|
72
63
|
|
73
|
-
|
74
|
-
|
64
|
+
# cookies
|
65
|
+
def cookies
|
66
|
+
@cookies ||= {}
|
75
67
|
end
|
76
68
|
|
69
|
+
|
70
|
+
# 代理设置
|
77
71
|
def auto_proxy=(value)
|
78
72
|
Rails.logger.debug "自动更新代理"
|
79
|
-
@
|
80
|
-
|
73
|
+
@auto_proxy = value
|
74
|
+
update_proxy if (value == true && @proxy.blank?)
|
75
|
+
end
|
76
|
+
|
77
|
+
# 代理使用的api方法名
|
78
|
+
def proxy_api
|
79
|
+
@proxy_api ||= "my"
|
80
|
+
end
|
81
|
+
|
82
|
+
# 调用代理 api使用的参数
|
83
|
+
def proxy_params
|
84
|
+
@proxy_params ||= {"key": "default"}
|
85
|
+
end
|
86
|
+
|
87
|
+
def update_proxy(proxy = {})
|
88
|
+
if (proxy.blank?)
|
89
|
+
@proxy = get_proxy
|
90
|
+
else
|
91
|
+
@proxy = proxy
|
92
|
+
end
|
93
|
+
# @http.update_proxy(proxy)
|
94
|
+
end
|
95
|
+
|
96
|
+
|
97
|
+
# 如果自动更新代理 则更新代理返回 true,否则返回false
|
98
|
+
def update_proxy?(proxy_ip = {})
|
99
|
+
if @auto_proxy
|
100
|
+
update_proxy(proxy_ip)
|
101
|
+
return true
|
102
|
+
else
|
103
|
+
return false
|
104
|
+
end
|
81
105
|
end
|
82
106
|
|
83
|
-
|
84
|
-
|
85
|
-
|
107
|
+
|
108
|
+
# 获取proxy
|
109
|
+
# 通过调用 api 获取代理或者通过自定义设置代理
|
110
|
+
def get_proxy
|
111
|
+
proxy_ip = nil
|
112
|
+
begin
|
113
|
+
Rails.logger.debug("开始获取代理IP")
|
114
|
+
proxy_client = HttpCrawler::Proxy.for(proxy_api)
|
115
|
+
proxy_r = proxy_client.get_proxy(proxy_params)
|
116
|
+
proxy_ip = proxy_r.results unless proxy_r.results.blank?
|
117
|
+
if proxy_ip.blank?
|
118
|
+
Rails.logger.warn "无最新代理等待5秒后重新获取"
|
119
|
+
else
|
120
|
+
break
|
121
|
+
end
|
122
|
+
sleep(5)
|
123
|
+
end while true
|
124
|
+
|
125
|
+
Rails.logger.debug("当前IP => #{@proxy},获取最新代理 => #{proxy_ip}")
|
126
|
+
|
127
|
+
unless proxy_ip && proxy_ip["p_addr"] && proxy_ip["p_port"]
|
128
|
+
Rails.logger.warn "无最新代理等待5秒后重新获取"
|
129
|
+
sleep(5)
|
130
|
+
proxy_ip = get_proxy
|
131
|
+
end
|
132
|
+
|
133
|
+
if (@proxy && proxy_ip && @proxy["p_addr"] == proxy_ip["p_addr"] && @proxy["p_port"] == proxy_ip["p_port"])
|
134
|
+
Rails.logger.warn "无最新代理等待5秒后重新获取"
|
135
|
+
sleep(5)
|
136
|
+
proxy_ip = get_proxy
|
137
|
+
end
|
138
|
+
proxy_ip
|
139
|
+
end
|
140
|
+
|
141
|
+
# 添加错误的url地址,表示这里面的url都是异常地址,存的是正则
|
142
|
+
def add_error_url(url_string)
|
143
|
+
@http.error_urls << url_string
|
144
|
+
end
|
145
|
+
|
146
|
+
|
147
|
+
# 初始化http参数
|
148
|
+
def init_client
|
149
|
+
|
150
|
+
end
|
151
|
+
|
152
|
+
# 初始化http请求前置条件
|
153
|
+
def http
|
154
|
+
# 自动重定向。最大重定向次数 max_hops: 5
|
155
|
+
h = HTTP.follow(max_hops: 5)
|
156
|
+
|
157
|
+
# 添加代理
|
158
|
+
h = h.via(@proxy["p_addr"], @proxy["p_port"].to_i, @proxy["p_user"], @proxy["p_pass"]) unless (@proxy.blank?)
|
159
|
+
|
160
|
+
# 添加头文件
|
161
|
+
h = h.headers(header) if header
|
162
|
+
|
163
|
+
# 添加cookies
|
164
|
+
h = h.cookies(cookies) if cookies
|
165
|
+
|
166
|
+
# 添加超时时间
|
167
|
+
h = h.timeout(connect: @connect_time, write: @write_time, read: @read_time)
|
168
|
+
|
169
|
+
h
|
170
|
+
end
|
171
|
+
|
172
|
+
# 发送 get 请求
|
173
|
+
def get(path, params = {})
|
174
|
+
http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx)
|
175
|
+
end
|
176
|
+
|
177
|
+
# 发送 post 请求
|
178
|
+
def post(path, params = {})
|
179
|
+
http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx)
|
180
|
+
end
|
181
|
+
|
182
|
+
#
|
183
|
+
# init_uri 如果未初始化@uri,则会报错
|
184
|
+
# 继承类需要重定义 init_uri
|
185
|
+
#
|
186
|
+
def initialize
|
187
|
+
# 初始化 uri
|
188
|
+
raise "Client uri为空" unless init_uri
|
189
|
+
|
190
|
+
# 初始化超时时间
|
191
|
+
init_timeout
|
192
|
+
|
193
|
+
# 初始化 ssl 协议
|
194
|
+
init_ssl
|
195
|
+
|
196
|
+
# 初始化一些 client 自定义参数
|
197
|
+
init_client
|
198
|
+
|
199
|
+
# 初始化 代理参数
|
200
|
+
@proxy_params = {key: "#{self.class}"}
|
86
201
|
end
|
87
202
|
|
88
203
|
end
|
89
|
-
end
|
204
|
+
end
|
205
|
+
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module HTTP
|
2
|
+
class Response
|
3
|
+
|
4
|
+
# 解压并转码 body 数据
|
5
|
+
def decoding_body
|
6
|
+
@decoding_body ||= self.to_s
|
7
|
+
end
|
8
|
+
|
9
|
+
# def decoding_body
|
10
|
+
|
11
|
+
def html
|
12
|
+
@html ||= Nokogiri::HTML(decoding_body)
|
13
|
+
end
|
14
|
+
|
15
|
+
def json
|
16
|
+
@json ||= JSON.parse(decoding_body)
|
17
|
+
@json = JSON.parse(@json) if String === @json
|
18
|
+
@json
|
19
|
+
end
|
20
|
+
|
21
|
+
# 通过readability 解析数据
|
22
|
+
def readability
|
23
|
+
@readability ||= Readability::Document.new(decoding_body, {do_not_guess_encoding: true})
|
24
|
+
end
|
25
|
+
|
26
|
+
# 解析
|
27
|
+
def parsing
|
28
|
+
json
|
29
|
+
end
|
30
|
+
|
31
|
+
# 获取解析结果
|
32
|
+
def results
|
33
|
+
@results ||= parsing
|
34
|
+
end
|
35
|
+
|
36
|
+
def get_date(str)
|
37
|
+
time = Time.now
|
38
|
+
case str
|
39
|
+
when /^(\d{1,2})小时前$/
|
40
|
+
time = time - $1.to_i.hours
|
41
|
+
when /^(\d{1,2})月(\d{1,2})日$/
|
42
|
+
time = Time.local(time.year, $1.to_i, $2.to_i)
|
43
|
+
when /^(\d{4})年(\d{1,2})月(\d{1,2})日$/
|
44
|
+
time = Time.local($1.to_i, $2.to_i, $3.to_i)
|
45
|
+
when /^(\d{1,2})月(\d{1,2})日[ ]{0,3}(\d{1,2}):(\d{1,2})$/ # 09月30日 12:04
|
46
|
+
time = Time.local(time.year, $1.to_i, $2.to_i, $3.to_i, $4.to_i)
|
47
|
+
end
|
48
|
+
return time
|
49
|
+
end
|
50
|
+
|
51
|
+
end # class Net::HTTPResponse
|
52
|
+
end
|
data/lib/http_crawler/http.rb
CHANGED
@@ -158,7 +158,9 @@ module HttpCrawler
|
|
158
158
|
response.error!
|
159
159
|
end
|
160
160
|
else
|
161
|
-
|
161
|
+
Rails.logger.debug uri_or_path
|
162
|
+
Rails.logger.debug initheader
|
163
|
+
Rails.logger.debug response.body
|
162
164
|
response.error!
|
163
165
|
end
|
164
166
|
end
|
@@ -255,5 +257,4 @@ module HttpCrawler
|
|
255
257
|
end
|
256
258
|
|
257
259
|
|
258
|
-
load File.dirname(__FILE__) + '/
|
259
|
-
load File.dirname(__FILE__) + '/net/response.rb'
|
260
|
+
load File.dirname(__FILE__) + '/http/response.rb'
|
@@ -22,7 +22,7 @@ module HttpCrawler
|
|
22
22
|
|
23
23
|
# http://39.108.59.38:7772/Tools/proxyIP.ashx?OrderNumber=ccd4c8912691f28861a1ed048fec88dc&poolIndex=22717&cache=1&qty=2
|
24
24
|
def get_proxy(parameter = {})
|
25
|
-
r = http.
|
25
|
+
r = http.get("/api/get_proxy")
|
26
26
|
r.extend(HttpCrawler::Proxy::TestProxyApi::Response::GetProxy)
|
27
27
|
end
|
28
28
|
|
data/lib/http_crawler/version.rb
CHANGED
@@ -6,20 +6,24 @@ module HttpCrawler
|
|
6
6
|
|
7
7
|
include(HttpCrawler::Client)
|
8
8
|
|
9
|
-
def init_http
|
10
|
-
@http.open_timeout = 3
|
11
|
-
@http.read_timeout = 3
|
12
|
-
end
|
13
|
-
|
14
9
|
def init_uri
|
15
|
-
@uri = URI("https://www.baidu.com
|
10
|
+
@uri = URI("https://www.baidu.com")
|
16
11
|
end
|
17
12
|
|
18
13
|
def index(parameter = {})
|
19
|
-
r =
|
14
|
+
r = get("/")
|
20
15
|
r.extend(HttpCrawler::Web::Baidu::Response::Index)
|
21
16
|
end
|
22
17
|
|
18
|
+
def search(parameter = {})
|
19
|
+
raise "parameter[:keyword] 不能为空" unless parameter[:keyword]
|
20
|
+
params = {
|
21
|
+
"wd": parameter[:keyword]
|
22
|
+
}
|
23
|
+
r = get("/s",params)
|
24
|
+
r
|
25
|
+
end
|
26
|
+
|
23
27
|
end
|
24
28
|
end # module Baidu
|
25
29
|
end # module Web
|
data/lib/http_crawler.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: http_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- jagger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-02-
|
11
|
+
date: 2019-02-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -118,6 +118,7 @@ files:
|
|
118
118
|
- ".gitignore"
|
119
119
|
- ".idea/.rakeTasks"
|
120
120
|
- ".idea/http_crawler.iml"
|
121
|
+
- ".idea/inspectionProfiles/Project_Default.xml"
|
121
122
|
- ".idea/misc.xml"
|
122
123
|
- ".idea/modules.xml"
|
123
124
|
- ".idea/vcs.xml"
|
@@ -136,8 +137,7 @@ files:
|
|
136
137
|
- lib/http_crawler/common/object.rb
|
137
138
|
- lib/http_crawler/common/string.rb
|
138
139
|
- lib/http_crawler/http.rb
|
139
|
-
- lib/http_crawler/
|
140
|
-
- lib/http_crawler/net/response.rb
|
140
|
+
- lib/http_crawler/http/response.rb
|
141
141
|
- lib/http_crawler/proxy.rb
|
142
142
|
- lib/http_crawler/proxy/README.md
|
143
143
|
- lib/http_crawler/proxy/client.rb
|
@@ -1,105 +0,0 @@
|
|
1
|
-
module Net
|
2
|
-
class HTTPResponse
|
3
|
-
|
4
|
-
# 解压并转码 body 数据
|
5
|
-
def decoding_body
|
6
|
-
|
7
|
-
return @decoding_body if @decoding_body
|
8
|
-
return nil unless body
|
9
|
-
|
10
|
-
# 数据解压
|
11
|
-
case header['Content-Encoding']
|
12
|
-
when 'gzip' then
|
13
|
-
sio = StringIO.new(body)
|
14
|
-
gz = Zlib::GzipReader.new(sio)
|
15
|
-
@decoding_body = gz.read()
|
16
|
-
when 'br'
|
17
|
-
@decoding_body = Brotli.inflate(body)
|
18
|
-
when 'deflate'
|
19
|
-
# 可能错误代码 暂时没解决 deflate 编码格式
|
20
|
-
@decoding_body = Zlib::Inflate.inflate(body)
|
21
|
-
else
|
22
|
-
@decoding_body = body
|
23
|
-
end
|
24
|
-
|
25
|
-
# 判断解压后数据编码格式
|
26
|
-
|
27
|
-
# 从header取编码格式
|
28
|
-
encoding = header['Content-Type'][/charset=([^, ;"]*)/, 1] if header['Content-Type']
|
29
|
-
|
30
|
-
# 从html中的 charset 取编码格式
|
31
|
-
encoding = @decoding_body[/charset=([^, ;"]*)/, 1] unless encoding
|
32
|
-
|
33
|
-
# 通过 CharDet 判断编码格式
|
34
|
-
encoding = CharDet.detect(@decoding_body)["encoding"] unless encoding
|
35
|
-
|
36
|
-
|
37
|
-
# 进行转码
|
38
|
-
begin
|
39
|
-
@decoding_body.force_encoding(encoding).encode!('utf-8') if encoding && encoding != @decoding_body.encoding
|
40
|
-
rescue => e
|
41
|
-
# 转码错误后再次使用 CharDet 判断编码格式后进行转码
|
42
|
-
cd = CharDet.detect(@decoding_body)["encoding"]
|
43
|
-
if (cd && cd != encoding)
|
44
|
-
@decoding_body.force_encoding(cd).encode!('utf-8') if encoding != @decoding_body.encoding
|
45
|
-
else
|
46
|
-
# 还是转码错误则抛出异常
|
47
|
-
Rails.logger.debug "encoding => #{encoding}"
|
48
|
-
Rails.logger.debug "cd => #{cd}"
|
49
|
-
Rails.logger.debug "@decoding_body[0..200] => #{@decoding_body[0..200]}"
|
50
|
-
raise e
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
@decoding_body
|
55
|
-
end
|
56
|
-
|
57
|
-
# def decoding_body
|
58
|
-
|
59
|
-
def html
|
60
|
-
@html ||= Nokogiri::HTML(decoding_body)
|
61
|
-
end
|
62
|
-
|
63
|
-
def json
|
64
|
-
@json ||= JSON.parse(decoding_body)
|
65
|
-
@json = JSON.parse(@json) if String === @json
|
66
|
-
@json
|
67
|
-
end
|
68
|
-
|
69
|
-
# 通过readability 解析数据
|
70
|
-
def readability
|
71
|
-
@readability ||= Readability::Document.new(decoding_body, {do_not_guess_encoding: true})
|
72
|
-
end
|
73
|
-
|
74
|
-
# 解析
|
75
|
-
def parsing
|
76
|
-
json
|
77
|
-
end
|
78
|
-
|
79
|
-
# 获取解析结果
|
80
|
-
def results
|
81
|
-
@results ||= parsing
|
82
|
-
end
|
83
|
-
|
84
|
-
def get_date(str)
|
85
|
-
time = Time.now
|
86
|
-
case str
|
87
|
-
when /^(\d{1,2})小时前$/
|
88
|
-
time = time - $1.to_i.hours
|
89
|
-
when /^(\d{1,2})月(\d{1,2})日$/
|
90
|
-
time = Time.local(time.year, $1.to_i, $2.to_i)
|
91
|
-
when /^(\d{4})年(\d{1,2})月(\d{1,2})日$/
|
92
|
-
time = Time.local($1.to_i, $2.to_i, $3.to_i)
|
93
|
-
when /^(\d{1,2})月(\d{1,2})日[ ]{0,3}(\d{1,2}):(\d{1,2})$/ # 09月30日 12:04
|
94
|
-
time = Time.local(time.year, $1.to_i, $2.to_i, $3.to_i, $4.to_i)
|
95
|
-
end
|
96
|
-
return time
|
97
|
-
end
|
98
|
-
|
99
|
-
|
100
|
-
# 是否是网站验证 true表示正常数据、false表示弹出网站验证
|
101
|
-
def web_verify(*arg)
|
102
|
-
true
|
103
|
-
end
|
104
|
-
end # class Net::HTTPResponse
|
105
|
-
end
|