http_crawler 0.2.3.3 → 0.3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.idea/inspectionProfiles/Project_Default.xml +6 -0
- data/.idea/workspace.xml +68 -101
- data/lib/http_crawler/client.rb +152 -36
- data/lib/http_crawler/http/response.rb +52 -0
- data/lib/http_crawler/http.rb +4 -3
- data/lib/http_crawler/proxy/test_proxy_api/client.rb +1 -1
- data/lib/http_crawler/version.rb +1 -1
- data/lib/http_crawler/web/baidu/client.rb +11 -7
- data/lib/http_crawler.rb +0 -1
- metadata +4 -4
- data/lib/http_crawler/net/http.rb +0 -7
- data/lib/http_crawler/net/response.rb +0 -105
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 870f70e609b513a6bdf5176e1ab2960d19e7b4f04d60c746845e6e27343288c7
|
|
4
|
+
data.tar.gz: 6706312c08462cb4dee0f8c2eac15b7882131832dc49f6cb336032ffdc0ab68d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 3c47791191aca7f3065eee2c9e223e3196f0890a4da8cd4bf8dee697eae18133758ac3225d49503d6ece004bc55474bbd9486b140b2d1588c92ee23d8b06663d
|
|
7
|
+
data.tar.gz: ff4da1b65de1431b9e6aa800e63d9e0b4c97ee942b68760136deb3bd819c08fc82504159e1a1ea14b6368458e8fa6a7568521a12d0eff259c637e2092791ea6f
|
data/.idea/workspace.xml
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
<project version="4">
|
|
3
3
|
<component name="ChangeListManager">
|
|
4
4
|
<list default="true" id="07223dd4-8944-486b-a29b-7461a5c9ec2d" name="Default" comment="">
|
|
5
|
+
<change afterPath="$PROJECT_DIR$/.idea/inspectionProfiles/Project_Default.xml" afterDir="false" />
|
|
5
6
|
<change beforePath="$PROJECT_DIR$/.idea/http_crawler.iml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/http_crawler.iml" afterDir="false" />
|
|
6
7
|
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
|
7
8
|
<change beforePath="$PROJECT_DIR$/lib/http_crawler/http.rb" beforeDir="false" afterPath="$PROJECT_DIR$/lib/http_crawler/http.rb" afterDir="false" />
|
|
@@ -17,47 +18,11 @@
|
|
|
17
18
|
</component>
|
|
18
19
|
<component name="FileEditorManager">
|
|
19
20
|
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
|
|
20
|
-
<file leaf-file-name="version.rb" pinned="false" current-in-tab="true">
|
|
21
|
-
<entry file="file://$PROJECT_DIR$/lib/http_crawler/version.rb">
|
|
22
|
-
<provider selected="true" editor-type-id="text-editor">
|
|
23
|
-
<state relative-caret-position="21">
|
|
24
|
-
<caret line="1" column="21" selection-start-line="1" selection-start-column="21" selection-end-line="1" selection-end-column="21" />
|
|
25
|
-
</state>
|
|
26
|
-
</provider>
|
|
27
|
-
</entry>
|
|
28
|
-
</file>
|
|
29
|
-
<file leaf-file-name="web.rb" pinned="false" current-in-tab="false">
|
|
30
|
-
<entry file="file://$PROJECT_DIR$/lib/http_crawler/web.rb">
|
|
31
|
-
<provider selected="true" editor-type-id="text-editor">
|
|
32
|
-
<state relative-caret-position="126">
|
|
33
|
-
<caret line="6" selection-start-line="6" selection-end-line="6" />
|
|
34
|
-
</state>
|
|
35
|
-
</provider>
|
|
36
|
-
</entry>
|
|
37
|
-
</file>
|
|
38
21
|
<file leaf-file-name="http.rb" pinned="false" current-in-tab="false">
|
|
39
22
|
<entry file="file://$PROJECT_DIR$/lib/http_crawler/http.rb">
|
|
40
23
|
<provider selected="true" editor-type-id="text-editor">
|
|
41
|
-
<state relative-caret-position="
|
|
42
|
-
<caret line="
|
|
43
|
-
</state>
|
|
44
|
-
</provider>
|
|
45
|
-
</entry>
|
|
46
|
-
</file>
|
|
47
|
-
<file leaf-file-name="compat.rb" pinned="false" current-in-tab="false">
|
|
48
|
-
<entry file="file://$USER_HOME$/.rvm/rubies/ruby-2.4.1/lib/ruby/2.4.0/webrick/compat.rb">
|
|
49
|
-
<provider selected="true" editor-type-id="text-editor">
|
|
50
|
-
<state relative-caret-position="378">
|
|
51
|
-
<caret line="18" column="1" selection-start-line="18" selection-start-column="1" selection-end-line="18" selection-end-column="1" />
|
|
52
|
-
</state>
|
|
53
|
-
</provider>
|
|
54
|
-
</entry>
|
|
55
|
-
</file>
|
|
56
|
-
<file leaf-file-name="errno.rb" pinned="false" current-in-tab="false">
|
|
57
|
-
<entry file="file://$APPLICATION_HOME_DIR$/rubystubs24/errno.rb">
|
|
58
|
-
<provider selected="true" editor-type-id="text-editor">
|
|
59
|
-
<state relative-caret-position="672">
|
|
60
|
-
<caret line="32" column="7" selection-start-line="32" selection-start-column="7" selection-end-line="32" selection-end-column="7" />
|
|
24
|
+
<state relative-caret-position="504">
|
|
25
|
+
<caret line="24" column="22" selection-start-line="24" selection-start-column="22" selection-end-line="24" selection-end-column="22" />
|
|
61
26
|
</state>
|
|
62
27
|
</provider>
|
|
63
28
|
</entry>
|
|
@@ -71,11 +36,11 @@
|
|
|
71
36
|
</provider>
|
|
72
37
|
</entry>
|
|
73
38
|
</file>
|
|
74
|
-
<file leaf-file-name="
|
|
75
|
-
<entry file="file://$PROJECT_DIR$/lib/http_crawler/
|
|
39
|
+
<file leaf-file-name="client.rb" pinned="false" current-in-tab="false">
|
|
40
|
+
<entry file="file://$PROJECT_DIR$/lib/http_crawler/client.rb">
|
|
76
41
|
<provider selected="true" editor-type-id="text-editor">
|
|
77
|
-
<state relative-caret-position="
|
|
78
|
-
<caret line="
|
|
42
|
+
<state relative-caret-position="483">
|
|
43
|
+
<caret line="23" column="19" selection-start-line="23" selection-start-column="19" selection-end-line="23" selection-end-column="19" />
|
|
79
44
|
</state>
|
|
80
45
|
</provider>
|
|
81
46
|
</entry>
|
|
@@ -83,17 +48,17 @@
|
|
|
83
48
|
<file leaf-file-name="client.rb" pinned="false" current-in-tab="false">
|
|
84
49
|
<entry file="file://$PROJECT_DIR$/lib/http_crawler/web/baidu/client.rb">
|
|
85
50
|
<provider selected="true" editor-type-id="text-editor">
|
|
86
|
-
<state relative-caret-position="
|
|
87
|
-
<caret line="
|
|
51
|
+
<state relative-caret-position="105">
|
|
52
|
+
<caret line="5" selection-start-line="5" selection-end-line="5" />
|
|
88
53
|
</state>
|
|
89
54
|
</provider>
|
|
90
55
|
</entry>
|
|
91
56
|
</file>
|
|
92
|
-
<file leaf-file-name="
|
|
93
|
-
<entry file="file://$PROJECT_DIR$/
|
|
57
|
+
<file leaf-file-name="response.rb" pinned="false" current-in-tab="true">
|
|
58
|
+
<entry file="file://$PROJECT_DIR$/lib/http_crawler/net/response.rb">
|
|
94
59
|
<provider selected="true" editor-type-id="text-editor">
|
|
95
|
-
<state relative-caret-position="
|
|
96
|
-
<caret line="
|
|
60
|
+
<state relative-caret-position="65">
|
|
61
|
+
<caret line="4" column="21" selection-start-line="4" selection-start-column="8" selection-end-line="4" selection-end-column="21" />
|
|
97
62
|
</state>
|
|
98
63
|
</provider>
|
|
99
64
|
</entry>
|
|
@@ -104,6 +69,8 @@
|
|
|
104
69
|
<findStrings>
|
|
105
70
|
<find>Crawler::Web</find>
|
|
106
71
|
<find>"Crawler</find>
|
|
72
|
+
<find>proxy</find>
|
|
73
|
+
<find>auto</find>
|
|
107
74
|
</findStrings>
|
|
108
75
|
<replaceStrings>
|
|
109
76
|
<replace>HttpCrawler::Web</replace>
|
|
@@ -143,8 +110,8 @@
|
|
|
143
110
|
<option value="$PROJECT_DIR$/lib/http_crawler/web/baidu/client.rb" />
|
|
144
111
|
<option value="$PROJECT_DIR$/lib/http_crawler/proxy.rb" />
|
|
145
112
|
<option value="$PROJECT_DIR$/lib/http_crawler/web.rb" />
|
|
146
|
-
<option value="$PROJECT_DIR$/lib/http_crawler/http.rb" />
|
|
147
113
|
<option value="$PROJECT_DIR$/lib/http_crawler/version.rb" />
|
|
114
|
+
<option value="$PROJECT_DIR$/lib/http_crawler/http.rb" />
|
|
148
115
|
</list>
|
|
149
116
|
</option>
|
|
150
117
|
</component>
|
|
@@ -167,7 +134,6 @@
|
|
|
167
134
|
<foldersAlwaysOnTop value="true" />
|
|
168
135
|
</navigator>
|
|
169
136
|
<panes>
|
|
170
|
-
<pane id="Scope" />
|
|
171
137
|
<pane id="ProjectPane">
|
|
172
138
|
<subPane>
|
|
173
139
|
<expand>
|
|
@@ -188,23 +154,23 @@
|
|
|
188
154
|
</path>
|
|
189
155
|
<path>
|
|
190
156
|
<item name="http_crawler" type="b2602c69:ProjectViewProjectNode" />
|
|
191
|
-
<item name="
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
<item name="
|
|
195
|
-
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
|
|
196
|
-
<item name="< RVM: ruby-2.4.1 >" type="70bed36:NamedLibraryElementNode" />
|
|
157
|
+
<item name="http_crawler" type="462c0819:PsiDirectoryNode" />
|
|
158
|
+
<item name="lib" type="462c0819:PsiDirectoryNode" />
|
|
159
|
+
<item name="http_crawler" type="462c0819:PsiDirectoryNode" />
|
|
160
|
+
<item name="net" type="462c0819:PsiDirectoryNode" />
|
|
197
161
|
</path>
|
|
198
162
|
<path>
|
|
199
163
|
<item name="http_crawler" type="b2602c69:ProjectViewProjectNode" />
|
|
200
|
-
<item name="
|
|
201
|
-
<item name="
|
|
202
|
-
<item name="
|
|
164
|
+
<item name="http_crawler" type="462c0819:PsiDirectoryNode" />
|
|
165
|
+
<item name="lib" type="462c0819:PsiDirectoryNode" />
|
|
166
|
+
<item name="http_crawler" type="462c0819:PsiDirectoryNode" />
|
|
167
|
+
<item name="web" type="462c0819:PsiDirectoryNode" />
|
|
203
168
|
</path>
|
|
204
169
|
</expand>
|
|
205
170
|
<select />
|
|
206
171
|
</subPane>
|
|
207
172
|
</pane>
|
|
173
|
+
<pane id="Scope" />
|
|
208
174
|
</panes>
|
|
209
175
|
</component>
|
|
210
176
|
<component name="PropertiesComponent">
|
|
@@ -251,18 +217,19 @@
|
|
|
251
217
|
<workItem from="1546240992243" duration="719000" />
|
|
252
218
|
<workItem from="1546291493927" duration="464000" />
|
|
253
219
|
<workItem from="1546436457874" duration="2443000" />
|
|
254
|
-
<workItem from="1549964225949" duration="
|
|
220
|
+
<workItem from="1549964225949" duration="1209000" />
|
|
221
|
+
<workItem from="1550132724592" duration="3006000" />
|
|
222
|
+
<workItem from="1550208979012" duration="304000" />
|
|
255
223
|
</task>
|
|
256
224
|
<servers />
|
|
257
225
|
</component>
|
|
258
226
|
<component name="TimeTrackingManager">
|
|
259
|
-
<option name="totallyTimeSpent" value="
|
|
227
|
+
<option name="totallyTimeSpent" value="27627000" />
|
|
260
228
|
</component>
|
|
261
229
|
<component name="ToolWindowManager">
|
|
262
230
|
<frame x="0" y="0" width="1680" height="1050" extended-state="0" />
|
|
263
|
-
<editor active="true" />
|
|
264
231
|
<layout>
|
|
265
|
-
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.
|
|
232
|
+
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.35042736" />
|
|
266
233
|
<window_info anchor="bottom" id="TODO" order="6" />
|
|
267
234
|
<window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
|
|
268
235
|
<window_info anchor="bottom" id="Event Log" order="7" side_tool="true" />
|
|
@@ -270,7 +237,7 @@
|
|
|
270
237
|
<window_info anchor="bottom" id="Database Changes" order="7" show_stripe_button="false" />
|
|
271
238
|
<window_info anchor="bottom" id="Version Control" order="7" />
|
|
272
239
|
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
|
|
273
|
-
<window_info
|
|
240
|
+
<window_info anchor="bottom" id="Terminal" order="7" weight="0.45725647" />
|
|
274
241
|
<window_info id="Favorites" order="2" side_tool="true" />
|
|
275
242
|
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
|
|
276
243
|
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
|
|
@@ -508,13 +475,6 @@
|
|
|
508
475
|
</state>
|
|
509
476
|
</provider>
|
|
510
477
|
</entry>
|
|
511
|
-
<entry file="file://$PROJECT_DIR$/lib/http_crawler/net/response.rb">
|
|
512
|
-
<provider selected="true" editor-type-id="text-editor">
|
|
513
|
-
<state relative-caret-position="90">
|
|
514
|
-
<caret line="6" column="45" lean-forward="true" selection-start-line="6" selection-start-column="45" selection-end-line="6" selection-end-column="45" />
|
|
515
|
-
</state>
|
|
516
|
-
</provider>
|
|
517
|
-
</entry>
|
|
518
478
|
<entry file="file://$PROJECT_DIR$/lib/http_crawler.rb">
|
|
519
479
|
<provider selected="true" editor-type-id="text-editor">
|
|
520
480
|
<state relative-caret-position="120">
|
|
@@ -550,13 +510,6 @@
|
|
|
550
510
|
</state>
|
|
551
511
|
</provider>
|
|
552
512
|
</entry>
|
|
553
|
-
<entry file="file://$PROJECT_DIR$/lib/http_crawler/client.rb">
|
|
554
|
-
<provider selected="true" editor-type-id="text-editor">
|
|
555
|
-
<state relative-caret-position="90">
|
|
556
|
-
<caret line="6" column="36" selection-start-line="6" selection-start-column="36" selection-end-line="6" selection-end-column="36" />
|
|
557
|
-
</state>
|
|
558
|
-
</provider>
|
|
559
|
-
</entry>
|
|
560
513
|
<entry file="file://$PROJECT_DIR$/lib/http_crawler/common.rb">
|
|
561
514
|
<provider selected="true" editor-type-id="text-editor">
|
|
562
515
|
<state relative-caret-position="42">
|
|
@@ -564,13 +517,6 @@
|
|
|
564
517
|
</state>
|
|
565
518
|
</provider>
|
|
566
519
|
</entry>
|
|
567
|
-
<entry file="file://$PROJECT_DIR$/lib/http_crawler/proxy.rb">
|
|
568
|
-
<provider selected="true" editor-type-id="text-editor">
|
|
569
|
-
<state relative-caret-position="441">
|
|
570
|
-
<caret line="21" selection-start-line="21" selection-end-line="21" />
|
|
571
|
-
</state>
|
|
572
|
-
</provider>
|
|
573
|
-
</entry>
|
|
574
520
|
<entry file="file://$PROJECT_DIR$/lib/http_crawler/web.rb">
|
|
575
521
|
<provider selected="true" editor-type-id="text-editor">
|
|
576
522
|
<state relative-caret-position="126">
|
|
@@ -578,13 +524,6 @@
|
|
|
578
524
|
</state>
|
|
579
525
|
</provider>
|
|
580
526
|
</entry>
|
|
581
|
-
<entry file="file://$PROJECT_DIR$/lib/http_crawler/web/baidu/client.rb">
|
|
582
|
-
<provider selected="true" editor-type-id="text-editor">
|
|
583
|
-
<state relative-caret-position="21">
|
|
584
|
-
<caret line="1" column="18" selection-start-line="1" selection-start-column="7" selection-end-line="1" selection-end-column="18" />
|
|
585
|
-
</state>
|
|
586
|
-
</provider>
|
|
587
|
-
</entry>
|
|
588
527
|
<entry file="file://$PROJECT_DIR$/Gemfile">
|
|
589
528
|
<provider selected="true" editor-type-id="text-editor">
|
|
590
529
|
<state relative-caret-position="168">
|
|
@@ -599,13 +538,6 @@
|
|
|
599
538
|
</state>
|
|
600
539
|
</provider>
|
|
601
540
|
</entry>
|
|
602
|
-
<entry file="file://$PROJECT_DIR$/lib/http_crawler/http.rb">
|
|
603
|
-
<provider selected="true" editor-type-id="text-editor">
|
|
604
|
-
<state relative-caret-position="4410">
|
|
605
|
-
<caret line="210" column="11" lean-forward="true" selection-start-line="210" selection-start-column="11" selection-end-line="210" selection-end-column="11" />
|
|
606
|
-
</state>
|
|
607
|
-
</provider>
|
|
608
|
-
</entry>
|
|
609
541
|
<entry file="file://$APPLICATION_HOME_DIR$/rubystubs24/system_call_error.rb">
|
|
610
542
|
<provider selected="true" editor-type-id="text-editor">
|
|
611
543
|
<state relative-caret-position="150">
|
|
@@ -622,8 +554,43 @@
|
|
|
622
554
|
</entry>
|
|
623
555
|
<entry file="file://$PROJECT_DIR$/lib/http_crawler/version.rb">
|
|
624
556
|
<provider selected="true" editor-type-id="text-editor">
|
|
625
|
-
<state relative-caret-position="
|
|
626
|
-
<caret line="
|
|
557
|
+
<state relative-caret-position="63">
|
|
558
|
+
<caret line="3" selection-start-line="3" selection-end-line="3" />
|
|
559
|
+
</state>
|
|
560
|
+
</provider>
|
|
561
|
+
</entry>
|
|
562
|
+
<entry file="file://$PROJECT_DIR$/lib/http_crawler/proxy.rb">
|
|
563
|
+
<provider selected="true" editor-type-id="text-editor">
|
|
564
|
+
<state relative-caret-position="441">
|
|
565
|
+
<caret line="21" selection-start-line="21" selection-end-line="21" />
|
|
566
|
+
</state>
|
|
567
|
+
</provider>
|
|
568
|
+
</entry>
|
|
569
|
+
<entry file="file://$PROJECT_DIR$/lib/http_crawler/client.rb">
|
|
570
|
+
<provider selected="true" editor-type-id="text-editor">
|
|
571
|
+
<state relative-caret-position="483">
|
|
572
|
+
<caret line="23" column="19" selection-start-line="23" selection-start-column="19" selection-end-line="23" selection-end-column="19" />
|
|
573
|
+
</state>
|
|
574
|
+
</provider>
|
|
575
|
+
</entry>
|
|
576
|
+
<entry file="file://$PROJECT_DIR$/lib/http_crawler/web/baidu/client.rb">
|
|
577
|
+
<provider selected="true" editor-type-id="text-editor">
|
|
578
|
+
<state relative-caret-position="105">
|
|
579
|
+
<caret line="5" selection-start-line="5" selection-end-line="5" />
|
|
580
|
+
</state>
|
|
581
|
+
</provider>
|
|
582
|
+
</entry>
|
|
583
|
+
<entry file="file://$PROJECT_DIR$/lib/http_crawler/http.rb">
|
|
584
|
+
<provider selected="true" editor-type-id="text-editor">
|
|
585
|
+
<state relative-caret-position="504">
|
|
586
|
+
<caret line="24" column="22" selection-start-line="24" selection-start-column="22" selection-end-line="24" selection-end-column="22" />
|
|
587
|
+
</state>
|
|
588
|
+
</provider>
|
|
589
|
+
</entry>
|
|
590
|
+
<entry file="file://$PROJECT_DIR$/lib/http_crawler/net/response.rb">
|
|
591
|
+
<provider selected="true" editor-type-id="text-editor">
|
|
592
|
+
<state relative-caret-position="65">
|
|
593
|
+
<caret line="4" column="21" selection-start-line="4" selection-start-column="8" selection-end-line="4" selection-end-column="21" />
|
|
627
594
|
</state>
|
|
628
595
|
</provider>
|
|
629
596
|
</entry>
|
data/lib/http_crawler/client.rb
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
load File.dirname(__FILE__) + '/http/response.rb'
|
|
2
|
+
|
|
1
3
|
module HttpCrawler
|
|
2
4
|
module Client
|
|
3
5
|
|
|
@@ -21,43 +23,32 @@ module HttpCrawler
|
|
|
21
23
|
end
|
|
22
24
|
end
|
|
23
25
|
|
|
24
|
-
attr_reader :
|
|
26
|
+
attr_reader :uri
|
|
25
27
|
|
|
26
|
-
#
|
|
27
28
|
# init_uri 如果未初始化@uri,则会报错
|
|
28
|
-
#
|
|
29
|
+
# 继承类需要实现 @uri = URI("http://host")
|
|
29
30
|
#
|
|
30
|
-
def
|
|
31
|
-
|
|
32
|
-
@http = HttpCrawler::HTTP.new(uri.host, uri.port)
|
|
33
|
-
|
|
34
|
-
@http.use_ssl = (uri.scheme == "https")
|
|
35
|
-
|
|
36
|
-
@http.open_timeout = 5
|
|
37
|
-
@http.read_timeout = 5
|
|
38
|
-
@http.proxy_key = "#{self.class}"
|
|
39
|
-
init_http
|
|
40
|
-
|
|
41
|
-
Rails.logger.debug "proxy_key => #{@http.proxy_key}"
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
# 初始化http参数
|
|
45
|
-
def init_http
|
|
46
|
-
|
|
31
|
+
def init_uri
|
|
32
|
+
@uri = nil
|
|
47
33
|
end
|
|
48
34
|
|
|
49
|
-
#
|
|
50
|
-
def
|
|
51
|
-
@
|
|
35
|
+
# 初始化超时时间
|
|
36
|
+
def init_timeout
|
|
37
|
+
@connect_time = 5
|
|
38
|
+
@write_time = 2
|
|
39
|
+
@read_time = 5
|
|
52
40
|
end
|
|
53
41
|
|
|
54
|
-
#
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
42
|
+
# 初始化 ssl 协议
|
|
43
|
+
def init_ssl
|
|
44
|
+
if (@uri.scheme == "https")
|
|
45
|
+
# ssl 协议
|
|
46
|
+
@ctx = OpenSSL::SSL::SSLContext.new
|
|
47
|
+
@ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
|
48
|
+
end
|
|
59
49
|
end
|
|
60
50
|
|
|
51
|
+
# 头文件相关方法
|
|
61
52
|
def header
|
|
62
53
|
@header ||= init_header
|
|
63
54
|
end
|
|
@@ -70,20 +61,145 @@ module HttpCrawler
|
|
|
70
61
|
nil
|
|
71
62
|
end
|
|
72
63
|
|
|
73
|
-
|
|
74
|
-
|
|
64
|
+
# cookies
|
|
65
|
+
def cookies
|
|
66
|
+
@cookies ||= {}
|
|
75
67
|
end
|
|
76
68
|
|
|
69
|
+
|
|
70
|
+
# 代理设置
|
|
77
71
|
def auto_proxy=(value)
|
|
78
72
|
Rails.logger.debug "自动更新代理"
|
|
79
|
-
@
|
|
80
|
-
|
|
73
|
+
@auto_proxy = value
|
|
74
|
+
update_proxy if (value == true && @proxy.blank?)
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
# 代理使用的api方法名
|
|
78
|
+
def proxy_api
|
|
79
|
+
@proxy_api ||= "my"
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
# 调用代理 api使用的参数
|
|
83
|
+
def proxy_params
|
|
84
|
+
@proxy_params ||= {"key": "default"}
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
def update_proxy(proxy = {})
|
|
88
|
+
if (proxy.blank?)
|
|
89
|
+
@proxy = get_proxy
|
|
90
|
+
else
|
|
91
|
+
@proxy = proxy
|
|
92
|
+
end
|
|
93
|
+
# @http.update_proxy(proxy)
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# 如果自动更新代理 则更新代理返回 true,否则返回false
|
|
98
|
+
def update_proxy?(proxy_ip = {})
|
|
99
|
+
if @auto_proxy
|
|
100
|
+
update_proxy(proxy_ip)
|
|
101
|
+
return true
|
|
102
|
+
else
|
|
103
|
+
return false
|
|
104
|
+
end
|
|
81
105
|
end
|
|
82
106
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
107
|
+
|
|
108
|
+
# 获取proxy
|
|
109
|
+
# 通过调用 api 获取代理或者通过自定义设置代理
|
|
110
|
+
def get_proxy
|
|
111
|
+
proxy_ip = nil
|
|
112
|
+
begin
|
|
113
|
+
Rails.logger.debug("开始获取代理IP")
|
|
114
|
+
proxy_client = HttpCrawler::Proxy.for(proxy_api)
|
|
115
|
+
proxy_r = proxy_client.get_proxy(proxy_params)
|
|
116
|
+
proxy_ip = proxy_r.results unless proxy_r.results.blank?
|
|
117
|
+
if proxy_ip.blank?
|
|
118
|
+
Rails.logger.warn "无最新代理等待5秒后重新获取"
|
|
119
|
+
else
|
|
120
|
+
break
|
|
121
|
+
end
|
|
122
|
+
sleep(5)
|
|
123
|
+
end while true
|
|
124
|
+
|
|
125
|
+
Rails.logger.debug("当前IP => #{@proxy},获取最新代理 => #{proxy_ip}")
|
|
126
|
+
|
|
127
|
+
unless proxy_ip && proxy_ip["p_addr"] && proxy_ip["p_port"]
|
|
128
|
+
Rails.logger.warn "无最新代理等待5秒后重新获取"
|
|
129
|
+
sleep(5)
|
|
130
|
+
proxy_ip = get_proxy
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
if (@proxy && proxy_ip && @proxy["p_addr"] == proxy_ip["p_addr"] && @proxy["p_port"] == proxy_ip["p_port"])
|
|
134
|
+
Rails.logger.warn "无最新代理等待5秒后重新获取"
|
|
135
|
+
sleep(5)
|
|
136
|
+
proxy_ip = get_proxy
|
|
137
|
+
end
|
|
138
|
+
proxy_ip
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
# 添加错误的url地址,表示这里面的url都是异常地址,存的是正则
|
|
142
|
+
def add_error_url(url_string)
|
|
143
|
+
@http.error_urls << url_string
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# 初始化http参数
|
|
148
|
+
def init_client
|
|
149
|
+
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
# 初始化http请求前置条件
|
|
153
|
+
def http
|
|
154
|
+
# 自动重定向。最大重定向次数 max_hops: 5
|
|
155
|
+
h = HTTP.follow(max_hops: 5)
|
|
156
|
+
|
|
157
|
+
# 添加代理
|
|
158
|
+
h = h.via(@proxy["p_addr"], @proxy["p_port"].to_i, @proxy["p_user"], @proxy["p_pass"]) unless (@proxy.blank?)
|
|
159
|
+
|
|
160
|
+
# 添加头文件
|
|
161
|
+
h = h.headers(header) if header
|
|
162
|
+
|
|
163
|
+
# 添加cookies
|
|
164
|
+
h = h.cookies(cookies) if cookies
|
|
165
|
+
|
|
166
|
+
# 添加超时时间
|
|
167
|
+
h = h.timeout(connect: @connect_time, write: @write_time, read: @read_time)
|
|
168
|
+
|
|
169
|
+
h
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
# 发送 get 请求
|
|
173
|
+
def get(path, params = {})
|
|
174
|
+
http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx)
|
|
175
|
+
end
|
|
176
|
+
|
|
177
|
+
# 发送 post 请求
|
|
178
|
+
def post(path, params = {})
|
|
179
|
+
http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx)
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
#
|
|
183
|
+
# init_uri 如果未初始化@uri,则会报错
|
|
184
|
+
# 继承类需要重定义 init_uri
|
|
185
|
+
#
|
|
186
|
+
def initialize
|
|
187
|
+
# 初始化 uri
|
|
188
|
+
raise "Client uri为空" unless init_uri
|
|
189
|
+
|
|
190
|
+
# 初始化超时时间
|
|
191
|
+
init_timeout
|
|
192
|
+
|
|
193
|
+
# 初始化 ssl 协议
|
|
194
|
+
init_ssl
|
|
195
|
+
|
|
196
|
+
# 初始化一些 client 自定义参数
|
|
197
|
+
init_client
|
|
198
|
+
|
|
199
|
+
# 初始化 代理参数
|
|
200
|
+
@proxy_params = {key: "#{self.class}"}
|
|
86
201
|
end
|
|
87
202
|
|
|
88
203
|
end
|
|
89
|
-
end
|
|
204
|
+
end
|
|
205
|
+
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
module HTTP
|
|
2
|
+
class Response
|
|
3
|
+
|
|
4
|
+
# 解压并转码 body 数据
|
|
5
|
+
def decoding_body
|
|
6
|
+
@decoding_body ||= self.to_s
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
# def decoding_body
|
|
10
|
+
|
|
11
|
+
def html
|
|
12
|
+
@html ||= Nokogiri::HTML(decoding_body)
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def json
|
|
16
|
+
@json ||= JSON.parse(decoding_body)
|
|
17
|
+
@json = JSON.parse(@json) if String === @json
|
|
18
|
+
@json
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
# 通过readability 解析数据
|
|
22
|
+
def readability
|
|
23
|
+
@readability ||= Readability::Document.new(decoding_body, {do_not_guess_encoding: true})
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
# 解析
|
|
27
|
+
def parsing
|
|
28
|
+
json
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
# 获取解析结果
|
|
32
|
+
def results
|
|
33
|
+
@results ||= parsing
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def get_date(str)
|
|
37
|
+
time = Time.now
|
|
38
|
+
case str
|
|
39
|
+
when /^(\d{1,2})小时前$/
|
|
40
|
+
time = time - $1.to_i.hours
|
|
41
|
+
when /^(\d{1,2})月(\d{1,2})日$/
|
|
42
|
+
time = Time.local(time.year, $1.to_i, $2.to_i)
|
|
43
|
+
when /^(\d{4})年(\d{1,2})月(\d{1,2})日$/
|
|
44
|
+
time = Time.local($1.to_i, $2.to_i, $3.to_i)
|
|
45
|
+
when /^(\d{1,2})月(\d{1,2})日[ ]{0,3}(\d{1,2}):(\d{1,2})$/ # 09月30日 12:04
|
|
46
|
+
time = Time.local(time.year, $1.to_i, $2.to_i, $3.to_i, $4.to_i)
|
|
47
|
+
end
|
|
48
|
+
return time
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
end # class Net::HTTPResponse
|
|
52
|
+
end
|
data/lib/http_crawler/http.rb
CHANGED
|
@@ -158,7 +158,9 @@ module HttpCrawler
|
|
|
158
158
|
response.error!
|
|
159
159
|
end
|
|
160
160
|
else
|
|
161
|
-
|
|
161
|
+
Rails.logger.debug uri_or_path
|
|
162
|
+
Rails.logger.debug initheader
|
|
163
|
+
Rails.logger.debug response.body
|
|
162
164
|
response.error!
|
|
163
165
|
end
|
|
164
166
|
end
|
|
@@ -255,5 +257,4 @@ module HttpCrawler
|
|
|
255
257
|
end
|
|
256
258
|
|
|
257
259
|
|
|
258
|
-
load File.dirname(__FILE__) + '/
|
|
259
|
-
load File.dirname(__FILE__) + '/net/response.rb'
|
|
260
|
+
load File.dirname(__FILE__) + '/http/response.rb'
|
|
@@ -22,7 +22,7 @@ module HttpCrawler
|
|
|
22
22
|
|
|
23
23
|
# http://39.108.59.38:7772/Tools/proxyIP.ashx?OrderNumber=ccd4c8912691f28861a1ed048fec88dc&poolIndex=22717&cache=1&qty=2
|
|
24
24
|
def get_proxy(parameter = {})
|
|
25
|
-
r = http.
|
|
25
|
+
r = http.get("/api/get_proxy")
|
|
26
26
|
r.extend(HttpCrawler::Proxy::TestProxyApi::Response::GetProxy)
|
|
27
27
|
end
|
|
28
28
|
|
data/lib/http_crawler/version.rb
CHANGED
|
@@ -6,20 +6,24 @@ module HttpCrawler
|
|
|
6
6
|
|
|
7
7
|
include(HttpCrawler::Client)
|
|
8
8
|
|
|
9
|
-
def init_http
|
|
10
|
-
@http.open_timeout = 3
|
|
11
|
-
@http.read_timeout = 3
|
|
12
|
-
end
|
|
13
|
-
|
|
14
9
|
def init_uri
|
|
15
|
-
@uri = URI("https://www.baidu.com
|
|
10
|
+
@uri = URI("https://www.baidu.com")
|
|
16
11
|
end
|
|
17
12
|
|
|
18
13
|
def index(parameter = {})
|
|
19
|
-
r =
|
|
14
|
+
r = get("/")
|
|
20
15
|
r.extend(HttpCrawler::Web::Baidu::Response::Index)
|
|
21
16
|
end
|
|
22
17
|
|
|
18
|
+
def search(parameter = {})
|
|
19
|
+
raise "parameter[:keyword] 不能为空" unless parameter[:keyword]
|
|
20
|
+
params = {
|
|
21
|
+
"wd": parameter[:keyword]
|
|
22
|
+
}
|
|
23
|
+
r = get("/s",params)
|
|
24
|
+
r
|
|
25
|
+
end
|
|
26
|
+
|
|
23
27
|
end
|
|
24
28
|
end # module Baidu
|
|
25
29
|
end # module Web
|
data/lib/http_crawler.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: http_crawler
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.3.0.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- jagger
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2019-02-
|
|
11
|
+
date: 2019-02-16 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rspec
|
|
@@ -118,6 +118,7 @@ files:
|
|
|
118
118
|
- ".gitignore"
|
|
119
119
|
- ".idea/.rakeTasks"
|
|
120
120
|
- ".idea/http_crawler.iml"
|
|
121
|
+
- ".idea/inspectionProfiles/Project_Default.xml"
|
|
121
122
|
- ".idea/misc.xml"
|
|
122
123
|
- ".idea/modules.xml"
|
|
123
124
|
- ".idea/vcs.xml"
|
|
@@ -136,8 +137,7 @@ files:
|
|
|
136
137
|
- lib/http_crawler/common/object.rb
|
|
137
138
|
- lib/http_crawler/common/string.rb
|
|
138
139
|
- lib/http_crawler/http.rb
|
|
139
|
-
- lib/http_crawler/
|
|
140
|
-
- lib/http_crawler/net/response.rb
|
|
140
|
+
- lib/http_crawler/http/response.rb
|
|
141
141
|
- lib/http_crawler/proxy.rb
|
|
142
142
|
- lib/http_crawler/proxy/README.md
|
|
143
143
|
- lib/http_crawler/proxy/client.rb
|
|
@@ -1,105 +0,0 @@
|
|
|
1
|
-
module Net
|
|
2
|
-
class HTTPResponse
|
|
3
|
-
|
|
4
|
-
# 解压并转码 body 数据
|
|
5
|
-
def decoding_body
|
|
6
|
-
|
|
7
|
-
return @decoding_body if @decoding_body
|
|
8
|
-
return nil unless body
|
|
9
|
-
|
|
10
|
-
# 数据解压
|
|
11
|
-
case header['Content-Encoding']
|
|
12
|
-
when 'gzip' then
|
|
13
|
-
sio = StringIO.new(body)
|
|
14
|
-
gz = Zlib::GzipReader.new(sio)
|
|
15
|
-
@decoding_body = gz.read()
|
|
16
|
-
when 'br'
|
|
17
|
-
@decoding_body = Brotli.inflate(body)
|
|
18
|
-
when 'deflate'
|
|
19
|
-
# 可能错误代码 暂时没解决 deflate 编码格式
|
|
20
|
-
@decoding_body = Zlib::Inflate.inflate(body)
|
|
21
|
-
else
|
|
22
|
-
@decoding_body = body
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
# 判断解压后数据编码格式
|
|
26
|
-
|
|
27
|
-
# 从header取编码格式
|
|
28
|
-
encoding = header['Content-Type'][/charset=([^, ;"]*)/, 1] if header['Content-Type']
|
|
29
|
-
|
|
30
|
-
# 从html中的 charset 取编码格式
|
|
31
|
-
encoding = @decoding_body[/charset=([^, ;"]*)/, 1] unless encoding
|
|
32
|
-
|
|
33
|
-
# 通过 CharDet 判断编码格式
|
|
34
|
-
encoding = CharDet.detect(@decoding_body)["encoding"] unless encoding
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
# 进行转码
|
|
38
|
-
begin
|
|
39
|
-
@decoding_body.force_encoding(encoding).encode!('utf-8') if encoding && encoding != @decoding_body.encoding
|
|
40
|
-
rescue => e
|
|
41
|
-
# 转码错误后再次使用 CharDet 判断编码格式后进行转码
|
|
42
|
-
cd = CharDet.detect(@decoding_body)["encoding"]
|
|
43
|
-
if (cd && cd != encoding)
|
|
44
|
-
@decoding_body.force_encoding(cd).encode!('utf-8') if encoding != @decoding_body.encoding
|
|
45
|
-
else
|
|
46
|
-
# 还是转码错误则抛出异常
|
|
47
|
-
Rails.logger.debug "encoding => #{encoding}"
|
|
48
|
-
Rails.logger.debug "cd => #{cd}"
|
|
49
|
-
Rails.logger.debug "@decoding_body[0..200] => #{@decoding_body[0..200]}"
|
|
50
|
-
raise e
|
|
51
|
-
end
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
@decoding_body
|
|
55
|
-
end
|
|
56
|
-
|
|
57
|
-
# def decoding_body
|
|
58
|
-
|
|
59
|
-
def html
|
|
60
|
-
@html ||= Nokogiri::HTML(decoding_body)
|
|
61
|
-
end
|
|
62
|
-
|
|
63
|
-
def json
|
|
64
|
-
@json ||= JSON.parse(decoding_body)
|
|
65
|
-
@json = JSON.parse(@json) if String === @json
|
|
66
|
-
@json
|
|
67
|
-
end
|
|
68
|
-
|
|
69
|
-
# 通过readability 解析数据
|
|
70
|
-
def readability
|
|
71
|
-
@readability ||= Readability::Document.new(decoding_body, {do_not_guess_encoding: true})
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
# 解析
|
|
75
|
-
def parsing
|
|
76
|
-
json
|
|
77
|
-
end
|
|
78
|
-
|
|
79
|
-
# 获取解析结果
|
|
80
|
-
def results
|
|
81
|
-
@results ||= parsing
|
|
82
|
-
end
|
|
83
|
-
|
|
84
|
-
def get_date(str)
|
|
85
|
-
time = Time.now
|
|
86
|
-
case str
|
|
87
|
-
when /^(\d{1,2})小时前$/
|
|
88
|
-
time = time - $1.to_i.hours
|
|
89
|
-
when /^(\d{1,2})月(\d{1,2})日$/
|
|
90
|
-
time = Time.local(time.year, $1.to_i, $2.to_i)
|
|
91
|
-
when /^(\d{4})年(\d{1,2})月(\d{1,2})日$/
|
|
92
|
-
time = Time.local($1.to_i, $2.to_i, $3.to_i)
|
|
93
|
-
when /^(\d{1,2})月(\d{1,2})日[ ]{0,3}(\d{1,2}):(\d{1,2})$/ # 09月30日 12:04
|
|
94
|
-
time = Time.local(time.year, $1.to_i, $2.to_i, $3.to_i, $4.to_i)
|
|
95
|
-
end
|
|
96
|
-
return time
|
|
97
|
-
end
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
# 是否是网站验证 true表示正常数据、false表示弹出网站验证
|
|
101
|
-
def web_verify(*arg)
|
|
102
|
-
true
|
|
103
|
-
end
|
|
104
|
-
end # class Net::HTTPResponse
|
|
105
|
-
end
|