http_crawler 0.2.3.3 → 0.3.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1143552ea7737865d9fe582c48b7fa3f5824da47cbb0e16d8911050bd807dc68
4
- data.tar.gz: 0e55bdd075e617e1f60e9fb6a54f75366c7cc670801d6a49b48c007e612b00a3
3
+ metadata.gz: 870f70e609b513a6bdf5176e1ab2960d19e7b4f04d60c746845e6e27343288c7
4
+ data.tar.gz: 6706312c08462cb4dee0f8c2eac15b7882131832dc49f6cb336032ffdc0ab68d
5
5
  SHA512:
6
- metadata.gz: e26f0d48ec8318b0d977933d95f091a8e9ce00ef872ef8668d66dcbacc653e984871210b132468b530fd7d30261bb2caae592b93f7d0fa73067432c7b4b38750
7
- data.tar.gz: 98004bfd606c248e367d18bac5cea84304a5f83854d9c9ac4adedbbf5ee84a6533f6d909ffa73afaf66f0313605093f84da2bde25c78c3e6ed60b86e7e8c71f7
6
+ metadata.gz: 3c47791191aca7f3065eee2c9e223e3196f0890a4da8cd4bf8dee697eae18133758ac3225d49503d6ece004bc55474bbd9486b140b2d1588c92ee23d8b06663d
7
+ data.tar.gz: ff4da1b65de1431b9e6aa800e63d9e0b4c97ee942b68760136deb3bd819c08fc82504159e1a1ea14b6368458e8fa6a7568521a12d0eff259c637e2092791ea6f
@@ -0,0 +1,6 @@
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="Rubocop" enabled="false" level="WARNING" enabled_by_default="false" />
5
+ </profile>
6
+ </component>
data/.idea/workspace.xml CHANGED
@@ -2,6 +2,7 @@
2
2
  <project version="4">
3
3
  <component name="ChangeListManager">
4
4
  <list default="true" id="07223dd4-8944-486b-a29b-7461a5c9ec2d" name="Default" comment="">
5
+ <change afterPath="$PROJECT_DIR$/.idea/inspectionProfiles/Project_Default.xml" afterDir="false" />
5
6
  <change beforePath="$PROJECT_DIR$/.idea/http_crawler.iml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/http_crawler.iml" afterDir="false" />
6
7
  <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
7
8
  <change beforePath="$PROJECT_DIR$/lib/http_crawler/http.rb" beforeDir="false" afterPath="$PROJECT_DIR$/lib/http_crawler/http.rb" afterDir="false" />
@@ -17,47 +18,11 @@
17
18
  </component>
18
19
  <component name="FileEditorManager">
19
20
  <leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
20
- <file leaf-file-name="version.rb" pinned="false" current-in-tab="true">
21
- <entry file="file://$PROJECT_DIR$/lib/http_crawler/version.rb">
22
- <provider selected="true" editor-type-id="text-editor">
23
- <state relative-caret-position="21">
24
- <caret line="1" column="21" selection-start-line="1" selection-start-column="21" selection-end-line="1" selection-end-column="21" />
25
- </state>
26
- </provider>
27
- </entry>
28
- </file>
29
- <file leaf-file-name="web.rb" pinned="false" current-in-tab="false">
30
- <entry file="file://$PROJECT_DIR$/lib/http_crawler/web.rb">
31
- <provider selected="true" editor-type-id="text-editor">
32
- <state relative-caret-position="126">
33
- <caret line="6" selection-start-line="6" selection-end-line="6" />
34
- </state>
35
- </provider>
36
- </entry>
37
- </file>
38
21
  <file leaf-file-name="http.rb" pinned="false" current-in-tab="false">
39
22
  <entry file="file://$PROJECT_DIR$/lib/http_crawler/http.rb">
40
23
  <provider selected="true" editor-type-id="text-editor">
41
- <state relative-caret-position="4410">
42
- <caret line="210" column="11" lean-forward="true" selection-start-line="210" selection-start-column="11" selection-end-line="210" selection-end-column="11" />
43
- </state>
44
- </provider>
45
- </entry>
46
- </file>
47
- <file leaf-file-name="compat.rb" pinned="false" current-in-tab="false">
48
- <entry file="file://$USER_HOME$/.rvm/rubies/ruby-2.4.1/lib/ruby/2.4.0/webrick/compat.rb">
49
- <provider selected="true" editor-type-id="text-editor">
50
- <state relative-caret-position="378">
51
- <caret line="18" column="1" selection-start-line="18" selection-start-column="1" selection-end-line="18" selection-end-column="1" />
52
- </state>
53
- </provider>
54
- </entry>
55
- </file>
56
- <file leaf-file-name="errno.rb" pinned="false" current-in-tab="false">
57
- <entry file="file://$APPLICATION_HOME_DIR$/rubystubs24/errno.rb">
58
- <provider selected="true" editor-type-id="text-editor">
59
- <state relative-caret-position="672">
60
- <caret line="32" column="7" selection-start-line="32" selection-start-column="7" selection-end-line="32" selection-end-column="7" />
24
+ <state relative-caret-position="504">
25
+ <caret line="24" column="22" selection-start-line="24" selection-start-column="22" selection-end-line="24" selection-end-column="22" />
61
26
  </state>
62
27
  </provider>
63
28
  </entry>
@@ -71,11 +36,11 @@
71
36
  </provider>
72
37
  </entry>
73
38
  </file>
74
- <file leaf-file-name="proxy.rb" pinned="false" current-in-tab="false">
75
- <entry file="file://$PROJECT_DIR$/lib/http_crawler/proxy.rb">
39
+ <file leaf-file-name="client.rb" pinned="false" current-in-tab="false">
40
+ <entry file="file://$PROJECT_DIR$/lib/http_crawler/client.rb">
76
41
  <provider selected="true" editor-type-id="text-editor">
77
- <state relative-caret-position="441">
78
- <caret line="21" selection-start-line="21" selection-end-line="21" />
42
+ <state relative-caret-position="483">
43
+ <caret line="23" column="19" selection-start-line="23" selection-start-column="19" selection-end-line="23" selection-end-column="19" />
79
44
  </state>
80
45
  </provider>
81
46
  </entry>
@@ -83,17 +48,17 @@
83
48
  <file leaf-file-name="client.rb" pinned="false" current-in-tab="false">
84
49
  <entry file="file://$PROJECT_DIR$/lib/http_crawler/web/baidu/client.rb">
85
50
  <provider selected="true" editor-type-id="text-editor">
86
- <state relative-caret-position="21">
87
- <caret line="1" column="18" selection-start-line="1" selection-start-column="7" selection-end-line="1" selection-end-column="18" />
51
+ <state relative-caret-position="105">
52
+ <caret line="5" selection-start-line="5" selection-end-line="5" />
88
53
  </state>
89
54
  </provider>
90
55
  </entry>
91
56
  </file>
92
- <file leaf-file-name="Gemfile" pinned="false" current-in-tab="false">
93
- <entry file="file://$PROJECT_DIR$/Gemfile">
57
+ <file leaf-file-name="response.rb" pinned="false" current-in-tab="true">
58
+ <entry file="file://$PROJECT_DIR$/lib/http_crawler/net/response.rb">
94
59
  <provider selected="true" editor-type-id="text-editor">
95
- <state relative-caret-position="168">
96
- <caret line="8" column="23" selection-start-line="8" selection-start-column="23" selection-end-line="8" selection-end-column="23" />
60
+ <state relative-caret-position="65">
61
+ <caret line="4" column="21" selection-start-line="4" selection-start-column="8" selection-end-line="4" selection-end-column="21" />
97
62
  </state>
98
63
  </provider>
99
64
  </entry>
@@ -104,6 +69,8 @@
104
69
  <findStrings>
105
70
  <find>Crawler::Web</find>
106
71
  <find>&quot;Crawler</find>
72
+ <find>proxy</find>
73
+ <find>auto</find>
107
74
  </findStrings>
108
75
  <replaceStrings>
109
76
  <replace>HttpCrawler::Web</replace>
@@ -143,8 +110,8 @@
143
110
  <option value="$PROJECT_DIR$/lib/http_crawler/web/baidu/client.rb" />
144
111
  <option value="$PROJECT_DIR$/lib/http_crawler/proxy.rb" />
145
112
  <option value="$PROJECT_DIR$/lib/http_crawler/web.rb" />
146
- <option value="$PROJECT_DIR$/lib/http_crawler/http.rb" />
147
113
  <option value="$PROJECT_DIR$/lib/http_crawler/version.rb" />
114
+ <option value="$PROJECT_DIR$/lib/http_crawler/http.rb" />
148
115
  </list>
149
116
  </option>
150
117
  </component>
@@ -167,7 +134,6 @@
167
134
  <foldersAlwaysOnTop value="true" />
168
135
  </navigator>
169
136
  <panes>
170
- <pane id="Scope" />
171
137
  <pane id="ProjectPane">
172
138
  <subPane>
173
139
  <expand>
@@ -188,23 +154,23 @@
188
154
  </path>
189
155
  <path>
190
156
  <item name="http_crawler" type="b2602c69:ProjectViewProjectNode" />
191
- <item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
192
- </path>
193
- <path>
194
- <item name="http_crawler" type="b2602c69:ProjectViewProjectNode" />
195
- <item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
196
- <item name="&lt; RVM: ruby-2.4.1 &gt;" type="70bed36:NamedLibraryElementNode" />
157
+ <item name="http_crawler" type="462c0819:PsiDirectoryNode" />
158
+ <item name="lib" type="462c0819:PsiDirectoryNode" />
159
+ <item name="http_crawler" type="462c0819:PsiDirectoryNode" />
160
+ <item name="net" type="462c0819:PsiDirectoryNode" />
197
161
  </path>
198
162
  <path>
199
163
  <item name="http_crawler" type="b2602c69:ProjectViewProjectNode" />
200
- <item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
201
- <item name="&lt; RVM: ruby-2.4.1 &gt;" type="70bed36:NamedLibraryElementNode" />
202
- <item name="rubystubs24" type="462c0819:PsiDirectoryNode" />
164
+ <item name="http_crawler" type="462c0819:PsiDirectoryNode" />
165
+ <item name="lib" type="462c0819:PsiDirectoryNode" />
166
+ <item name="http_crawler" type="462c0819:PsiDirectoryNode" />
167
+ <item name="web" type="462c0819:PsiDirectoryNode" />
203
168
  </path>
204
169
  </expand>
205
170
  <select />
206
171
  </subPane>
207
172
  </pane>
173
+ <pane id="Scope" />
208
174
  </panes>
209
175
  </component>
210
176
  <component name="PropertiesComponent">
@@ -251,18 +217,19 @@
251
217
  <workItem from="1546240992243" duration="719000" />
252
218
  <workItem from="1546291493927" duration="464000" />
253
219
  <workItem from="1546436457874" duration="2443000" />
254
- <workItem from="1549964225949" duration="5000" />
220
+ <workItem from="1549964225949" duration="1209000" />
221
+ <workItem from="1550132724592" duration="3006000" />
222
+ <workItem from="1550208979012" duration="304000" />
255
223
  </task>
256
224
  <servers />
257
225
  </component>
258
226
  <component name="TimeTrackingManager">
259
- <option name="totallyTimeSpent" value="23113000" />
227
+ <option name="totallyTimeSpent" value="27627000" />
260
228
  </component>
261
229
  <component name="ToolWindowManager">
262
230
  <frame x="0" y="0" width="1680" height="1050" extended-state="0" />
263
- <editor active="true" />
264
231
  <layout>
265
- <window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.34188035" />
232
+ <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.35042736" />
266
233
  <window_info anchor="bottom" id="TODO" order="6" />
267
234
  <window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
268
235
  <window_info anchor="bottom" id="Event Log" order="7" side_tool="true" />
@@ -270,7 +237,7 @@
270
237
  <window_info anchor="bottom" id="Database Changes" order="7" show_stripe_button="false" />
271
238
  <window_info anchor="bottom" id="Version Control" order="7" />
272
239
  <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
273
- <window_info active="true" anchor="bottom" id="Terminal" order="7" visible="true" weight="0.45725647" />
240
+ <window_info anchor="bottom" id="Terminal" order="7" weight="0.45725647" />
274
241
  <window_info id="Favorites" order="2" side_tool="true" />
275
242
  <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
276
243
  <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
@@ -508,13 +475,6 @@
508
475
  </state>
509
476
  </provider>
510
477
  </entry>
511
- <entry file="file://$PROJECT_DIR$/lib/http_crawler/net/response.rb">
512
- <provider selected="true" editor-type-id="text-editor">
513
- <state relative-caret-position="90">
514
- <caret line="6" column="45" lean-forward="true" selection-start-line="6" selection-start-column="45" selection-end-line="6" selection-end-column="45" />
515
- </state>
516
- </provider>
517
- </entry>
518
478
  <entry file="file://$PROJECT_DIR$/lib/http_crawler.rb">
519
479
  <provider selected="true" editor-type-id="text-editor">
520
480
  <state relative-caret-position="120">
@@ -550,13 +510,6 @@
550
510
  </state>
551
511
  </provider>
552
512
  </entry>
553
- <entry file="file://$PROJECT_DIR$/lib/http_crawler/client.rb">
554
- <provider selected="true" editor-type-id="text-editor">
555
- <state relative-caret-position="90">
556
- <caret line="6" column="36" selection-start-line="6" selection-start-column="36" selection-end-line="6" selection-end-column="36" />
557
- </state>
558
- </provider>
559
- </entry>
560
513
  <entry file="file://$PROJECT_DIR$/lib/http_crawler/common.rb">
561
514
  <provider selected="true" editor-type-id="text-editor">
562
515
  <state relative-caret-position="42">
@@ -564,13 +517,6 @@
564
517
  </state>
565
518
  </provider>
566
519
  </entry>
567
- <entry file="file://$PROJECT_DIR$/lib/http_crawler/proxy.rb">
568
- <provider selected="true" editor-type-id="text-editor">
569
- <state relative-caret-position="441">
570
- <caret line="21" selection-start-line="21" selection-end-line="21" />
571
- </state>
572
- </provider>
573
- </entry>
574
520
  <entry file="file://$PROJECT_DIR$/lib/http_crawler/web.rb">
575
521
  <provider selected="true" editor-type-id="text-editor">
576
522
  <state relative-caret-position="126">
@@ -578,13 +524,6 @@
578
524
  </state>
579
525
  </provider>
580
526
  </entry>
581
- <entry file="file://$PROJECT_DIR$/lib/http_crawler/web/baidu/client.rb">
582
- <provider selected="true" editor-type-id="text-editor">
583
- <state relative-caret-position="21">
584
- <caret line="1" column="18" selection-start-line="1" selection-start-column="7" selection-end-line="1" selection-end-column="18" />
585
- </state>
586
- </provider>
587
- </entry>
588
527
  <entry file="file://$PROJECT_DIR$/Gemfile">
589
528
  <provider selected="true" editor-type-id="text-editor">
590
529
  <state relative-caret-position="168">
@@ -599,13 +538,6 @@
599
538
  </state>
600
539
  </provider>
601
540
  </entry>
602
- <entry file="file://$PROJECT_DIR$/lib/http_crawler/http.rb">
603
- <provider selected="true" editor-type-id="text-editor">
604
- <state relative-caret-position="4410">
605
- <caret line="210" column="11" lean-forward="true" selection-start-line="210" selection-start-column="11" selection-end-line="210" selection-end-column="11" />
606
- </state>
607
- </provider>
608
- </entry>
609
541
  <entry file="file://$APPLICATION_HOME_DIR$/rubystubs24/system_call_error.rb">
610
542
  <provider selected="true" editor-type-id="text-editor">
611
543
  <state relative-caret-position="150">
@@ -622,8 +554,43 @@
622
554
  </entry>
623
555
  <entry file="file://$PROJECT_DIR$/lib/http_crawler/version.rb">
624
556
  <provider selected="true" editor-type-id="text-editor">
625
- <state relative-caret-position="21">
626
- <caret line="1" column="21" selection-start-line="1" selection-start-column="21" selection-end-line="1" selection-end-column="21" />
557
+ <state relative-caret-position="63">
558
+ <caret line="3" selection-start-line="3" selection-end-line="3" />
559
+ </state>
560
+ </provider>
561
+ </entry>
562
+ <entry file="file://$PROJECT_DIR$/lib/http_crawler/proxy.rb">
563
+ <provider selected="true" editor-type-id="text-editor">
564
+ <state relative-caret-position="441">
565
+ <caret line="21" selection-start-line="21" selection-end-line="21" />
566
+ </state>
567
+ </provider>
568
+ </entry>
569
+ <entry file="file://$PROJECT_DIR$/lib/http_crawler/client.rb">
570
+ <provider selected="true" editor-type-id="text-editor">
571
+ <state relative-caret-position="483">
572
+ <caret line="23" column="19" selection-start-line="23" selection-start-column="19" selection-end-line="23" selection-end-column="19" />
573
+ </state>
574
+ </provider>
575
+ </entry>
576
+ <entry file="file://$PROJECT_DIR$/lib/http_crawler/web/baidu/client.rb">
577
+ <provider selected="true" editor-type-id="text-editor">
578
+ <state relative-caret-position="105">
579
+ <caret line="5" selection-start-line="5" selection-end-line="5" />
580
+ </state>
581
+ </provider>
582
+ </entry>
583
+ <entry file="file://$PROJECT_DIR$/lib/http_crawler/http.rb">
584
+ <provider selected="true" editor-type-id="text-editor">
585
+ <state relative-caret-position="504">
586
+ <caret line="24" column="22" selection-start-line="24" selection-start-column="22" selection-end-line="24" selection-end-column="22" />
587
+ </state>
588
+ </provider>
589
+ </entry>
590
+ <entry file="file://$PROJECT_DIR$/lib/http_crawler/net/response.rb">
591
+ <provider selected="true" editor-type-id="text-editor">
592
+ <state relative-caret-position="65">
593
+ <caret line="4" column="21" selection-start-line="4" selection-start-column="8" selection-end-line="4" selection-end-column="21" />
627
594
  </state>
628
595
  </provider>
629
596
  </entry>
@@ -1,3 +1,5 @@
1
+ load File.dirname(__FILE__) + '/http/response.rb'
2
+
1
3
  module HttpCrawler
2
4
  module Client
3
5
 
@@ -21,43 +23,32 @@ module HttpCrawler
21
23
  end
22
24
  end
23
25
 
24
- attr_reader :http, :uri
26
+ attr_reader :uri
25
27
 
26
- #
27
28
  # init_uri 如果未初始化@uri,则会报错
28
- # 继承类需要重定义 init_uri
29
+ # 继承类需要实现 @uri = URI("http://host")
29
30
  #
30
- def initialize
31
- raise "Client uri为空" unless init_uri
32
- @http = HttpCrawler::HTTP.new(uri.host, uri.port)
33
-
34
- @http.use_ssl = (uri.scheme == "https")
35
-
36
- @http.open_timeout = 5
37
- @http.read_timeout = 5
38
- @http.proxy_key = "#{self.class}"
39
- init_http
40
-
41
- Rails.logger.debug "proxy_key => #{@http.proxy_key}"
42
- end
43
-
44
- # 初始化http参数
45
- def init_http
46
-
31
+ def init_uri
32
+ @uri = nil
47
33
  end
48
34
 
49
- # 添加错误的url地址,表示这里面的url都是异常地址,存的是正则
50
- def add_error_url(url_string)
51
- @http.error_urls << url_string
35
+ # 初始化超时时间
36
+ def init_timeout
37
+ @connect_time = 5
38
+ @write_time = 2
39
+ @read_time = 5
52
40
  end
53
41
 
54
- # init_uri 如果未初始化@uri,则会报错
55
- # 继承类需要实现 @uri = URI("http://host")
56
- #
57
- def init_uri
58
- @uri = nil
42
+ # 初始化 ssl 协议
43
+ def init_ssl
44
+ if (@uri.scheme == "https")
45
+ # ssl 协议
46
+ @ctx = OpenSSL::SSL::SSLContext.new
47
+ @ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
48
+ end
59
49
  end
60
50
 
51
+ # 头文件相关方法
61
52
  def header
62
53
  @header ||= init_header
63
54
  end
@@ -70,20 +61,145 @@ module HttpCrawler
70
61
  nil
71
62
  end
72
63
 
73
- def update_proxy(proxy = {})
74
- @http.update_proxy(proxy)
64
+ # cookies
65
+ def cookies
66
+ @cookies ||= {}
75
67
  end
76
68
 
69
+
70
+ # 代理设置
77
71
  def auto_proxy=(value)
78
72
  Rails.logger.debug "自动更新代理"
79
- @http.auto_proxy = value
80
- @http.update_proxy if (value == true && @http.proxy? == false)
73
+ @auto_proxy = value
74
+ update_proxy if (value == true && @proxy.blank?)
75
+ end
76
+
77
+ # 代理使用的api方法名
78
+ def proxy_api
79
+ @proxy_api ||= "my"
80
+ end
81
+
82
+ # 调用代理 api使用的参数
83
+ def proxy_params
84
+ @proxy_params ||= {"key": "default"}
85
+ end
86
+
87
+ def update_proxy(proxy = {})
88
+ if (proxy.blank?)
89
+ @proxy = get_proxy
90
+ else
91
+ @proxy = proxy
92
+ end
93
+ # @http.update_proxy(proxy)
94
+ end
95
+
96
+
97
+ # 如果自动更新代理 则更新代理返回 true,否则返回false
98
+ def update_proxy?(proxy_ip = {})
99
+ if @auto_proxy
100
+ update_proxy(proxy_ip)
101
+ return true
102
+ else
103
+ return false
104
+ end
81
105
  end
82
106
 
83
- # 是否验证码界面
84
- def validation_page?(*arg)
85
- false
107
+
108
+ # 获取proxy
109
+ # 通过调用 api 获取代理或者通过自定义设置代理
110
+ def get_proxy
111
+ proxy_ip = nil
112
+ begin
113
+ Rails.logger.debug("开始获取代理IP")
114
+ proxy_client = HttpCrawler::Proxy.for(proxy_api)
115
+ proxy_r = proxy_client.get_proxy(proxy_params)
116
+ proxy_ip = proxy_r.results unless proxy_r.results.blank?
117
+ if proxy_ip.blank?
118
+ Rails.logger.warn "无最新代理等待5秒后重新获取"
119
+ else
120
+ break
121
+ end
122
+ sleep(5)
123
+ end while true
124
+
125
+ Rails.logger.debug("当前IP => #{@proxy},获取最新代理 => #{proxy_ip}")
126
+
127
+ unless proxy_ip && proxy_ip["p_addr"] && proxy_ip["p_port"]
128
+ Rails.logger.warn "无最新代理等待5秒后重新获取"
129
+ sleep(5)
130
+ proxy_ip = get_proxy
131
+ end
132
+
133
+ if (@proxy && proxy_ip && @proxy["p_addr"] == proxy_ip["p_addr"] && @proxy["p_port"] == proxy_ip["p_port"])
134
+ Rails.logger.warn "无最新代理等待5秒后重新获取"
135
+ sleep(5)
136
+ proxy_ip = get_proxy
137
+ end
138
+ proxy_ip
139
+ end
140
+
141
+ # 添加错误的url地址,表示这里面的url都是异常地址,存的是正则
142
+ def add_error_url(url_string)
143
+ @http.error_urls << url_string
144
+ end
145
+
146
+
147
+ # 初始化http参数
148
+ def init_client
149
+
150
+ end
151
+
152
+ # 初始化http请求前置条件
153
+ def http
154
+ # 自动重定向。最大重定向次数 max_hops: 5
155
+ h = HTTP.follow(max_hops: 5)
156
+
157
+ # 添加代理
158
+ h = h.via(@proxy["p_addr"], @proxy["p_port"].to_i, @proxy["p_user"], @proxy["p_pass"]) unless (@proxy.blank?)
159
+
160
+ # 添加头文件
161
+ h = h.headers(header) if header
162
+
163
+ # 添加cookies
164
+ h = h.cookies(cookies) if cookies
165
+
166
+ # 添加超时时间
167
+ h = h.timeout(connect: @connect_time, write: @write_time, read: @read_time)
168
+
169
+ h
170
+ end
171
+
172
+ # 发送 get 请求
173
+ def get(path, params = {})
174
+ http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx)
175
+ end
176
+
177
+ # 发送 post 请求
178
+ def post(path, params = {})
179
+ http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx)
180
+ end
181
+
182
+ #
183
+ # init_uri 如果未初始化@uri,则会报错
184
+ # 继承类需要重定义 init_uri
185
+ #
186
+ def initialize
187
+ # 初始化 uri
188
+ raise "Client uri为空" unless init_uri
189
+
190
+ # 初始化超时时间
191
+ init_timeout
192
+
193
+ # 初始化 ssl 协议
194
+ init_ssl
195
+
196
+ # 初始化一些 client 自定义参数
197
+ init_client
198
+
199
+ # 初始化 代理参数
200
+ @proxy_params = {key: "#{self.class}"}
86
201
  end
87
202
 
88
203
  end
89
- end
204
+ end
205
+
@@ -0,0 +1,52 @@
1
+ module HTTP
2
+ class Response
3
+
4
+ # 解压并转码 body 数据
5
+ def decoding_body
6
+ @decoding_body ||= self.to_s
7
+ end
8
+
9
+ # def decoding_body
10
+
11
+ def html
12
+ @html ||= Nokogiri::HTML(decoding_body)
13
+ end
14
+
15
+ def json
16
+ @json ||= JSON.parse(decoding_body)
17
+ @json = JSON.parse(@json) if String === @json
18
+ @json
19
+ end
20
+
21
+ # 通过readability 解析数据
22
+ def readability
23
+ @readability ||= Readability::Document.new(decoding_body, {do_not_guess_encoding: true})
24
+ end
25
+
26
+ # 解析
27
+ def parsing
28
+ json
29
+ end
30
+
31
+ # 获取解析结果
32
+ def results
33
+ @results ||= parsing
34
+ end
35
+
36
+ def get_date(str)
37
+ time = Time.now
38
+ case str
39
+ when /^(\d{1,2})小时前$/
40
+ time = time - $1.to_i.hours
41
+ when /^(\d{1,2})月(\d{1,2})日$/
42
+ time = Time.local(time.year, $1.to_i, $2.to_i)
43
+ when /^(\d{4})年(\d{1,2})月(\d{1,2})日$/
44
+ time = Time.local($1.to_i, $2.to_i, $3.to_i)
45
+ when /^(\d{1,2})月(\d{1,2})日[ ]{0,3}(\d{1,2}):(\d{1,2})$/ # 09月30日 12:04
46
+ time = Time.local(time.year, $1.to_i, $2.to_i, $3.to_i, $4.to_i)
47
+ end
48
+ return time
49
+ end
50
+
51
+ end # class Net::HTTPResponse
52
+ end
@@ -158,7 +158,9 @@ module HttpCrawler
158
158
  response.error!
159
159
  end
160
160
  else
161
- server_error_sleep
161
+ Rails.logger.debug uri_or_path
162
+ Rails.logger.debug initheader
163
+ Rails.logger.debug response.body
162
164
  response.error!
163
165
  end
164
166
  end
@@ -255,5 +257,4 @@ module HttpCrawler
255
257
  end
256
258
 
257
259
 
258
- load File.dirname(__FILE__) + '/net/http.rb'
259
- load File.dirname(__FILE__) + '/net/response.rb'
260
+ load File.dirname(__FILE__) + '/http/response.rb'
@@ -22,7 +22,7 @@ module HttpCrawler
22
22
 
23
23
  # http://39.108.59.38:7772/Tools/proxyIP.ashx?OrderNumber=ccd4c8912691f28861a1ed048fec88dc&poolIndex=22717&cache=1&qty=2
24
24
  def get_proxy(parameter = {})
25
- r = http.get_fetch("/api/get_proxy")
25
+ r = http.get("/api/get_proxy")
26
26
  r.extend(HttpCrawler::Proxy::TestProxyApi::Response::GetProxy)
27
27
  end
28
28
 
@@ -1,3 +1,3 @@
1
1
  module HttpCrawler
2
- VERSION = "0.2.3.3"
2
+ VERSION = "0.3.0.0"
3
3
  end
@@ -6,20 +6,24 @@ module HttpCrawler
6
6
 
7
7
  include(HttpCrawler::Client)
8
8
 
9
- def init_http
10
- @http.open_timeout = 3
11
- @http.read_timeout = 3
12
- end
13
-
14
9
  def init_uri
15
- @uri = URI("https://www.baidu.com/")
10
+ @uri = URI("https://www.baidu.com")
16
11
  end
17
12
 
18
13
  def index(parameter = {})
19
- r = http.get_fetch("/", header)
14
+ r = get("/")
20
15
  r.extend(HttpCrawler::Web::Baidu::Response::Index)
21
16
  end
22
17
 
18
+ def search(parameter = {})
19
+ raise "parameter[:keyword] 不能为空" unless parameter[:keyword]
20
+ params = {
21
+ "wd": parameter[:keyword]
22
+ }
23
+ r = get("/s",params)
24
+ r
25
+ end
26
+
23
27
  end
24
28
  end # module Baidu
25
29
  end # module Web
data/lib/http_crawler.rb CHANGED
@@ -4,7 +4,6 @@ require 'digest/md5'
4
4
  require 'nokogiri'
5
5
 
6
6
  load 'http_crawler/common.rb'
7
- load 'http_crawler/http.rb'
8
7
  load 'http_crawler/client.rb'
9
8
  load 'http_crawler/web.rb'
10
9
  load 'http_crawler/proxy.rb'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: http_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3.3
4
+ version: 0.3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - jagger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-02-12 00:00:00.000000000 Z
11
+ date: 2019-02-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -118,6 +118,7 @@ files:
118
118
  - ".gitignore"
119
119
  - ".idea/.rakeTasks"
120
120
  - ".idea/http_crawler.iml"
121
+ - ".idea/inspectionProfiles/Project_Default.xml"
121
122
  - ".idea/misc.xml"
122
123
  - ".idea/modules.xml"
123
124
  - ".idea/vcs.xml"
@@ -136,8 +137,7 @@ files:
136
137
  - lib/http_crawler/common/object.rb
137
138
  - lib/http_crawler/common/string.rb
138
139
  - lib/http_crawler/http.rb
139
- - lib/http_crawler/net/http.rb
140
- - lib/http_crawler/net/response.rb
140
+ - lib/http_crawler/http/response.rb
141
141
  - lib/http_crawler/proxy.rb
142
142
  - lib/http_crawler/proxy/README.md
143
143
  - lib/http_crawler/proxy/client.rb
@@ -1,7 +0,0 @@
1
- module Net
2
- class HTTP
3
-
4
-
5
- end # class HTTP
6
- end # module Net
7
-
@@ -1,105 +0,0 @@
1
- module Net
2
- class HTTPResponse
3
-
4
- # 解压并转码 body 数据
5
- def decoding_body
6
-
7
- return @decoding_body if @decoding_body
8
- return nil unless body
9
-
10
- # 数据解压
11
- case header['Content-Encoding']
12
- when 'gzip' then
13
- sio = StringIO.new(body)
14
- gz = Zlib::GzipReader.new(sio)
15
- @decoding_body = gz.read()
16
- when 'br'
17
- @decoding_body = Brotli.inflate(body)
18
- when 'deflate'
19
- # 可能错误代码 暂时没解决 deflate 编码格式
20
- @decoding_body = Zlib::Inflate.inflate(body)
21
- else
22
- @decoding_body = body
23
- end
24
-
25
- # 判断解压后数据编码格式
26
-
27
- # 从header取编码格式
28
- encoding = header['Content-Type'][/charset=([^, ;"]*)/, 1] if header['Content-Type']
29
-
30
- # 从html中的 charset 取编码格式
31
- encoding = @decoding_body[/charset=([^, ;"]*)/, 1] unless encoding
32
-
33
- # 通过 CharDet 判断编码格式
34
- encoding = CharDet.detect(@decoding_body)["encoding"] unless encoding
35
-
36
-
37
- # 进行转码
38
- begin
39
- @decoding_body.force_encoding(encoding).encode!('utf-8') if encoding && encoding != @decoding_body.encoding
40
- rescue => e
41
- # 转码错误后再次使用 CharDet 判断编码格式后进行转码
42
- cd = CharDet.detect(@decoding_body)["encoding"]
43
- if (cd && cd != encoding)
44
- @decoding_body.force_encoding(cd).encode!('utf-8') if encoding != @decoding_body.encoding
45
- else
46
- # 还是转码错误则抛出异常
47
- Rails.logger.debug "encoding => #{encoding}"
48
- Rails.logger.debug "cd => #{cd}"
49
- Rails.logger.debug "@decoding_body[0..200] => #{@decoding_body[0..200]}"
50
- raise e
51
- end
52
- end
53
-
54
- @decoding_body
55
- end
56
-
57
- # def decoding_body
58
-
59
- def html
60
- @html ||= Nokogiri::HTML(decoding_body)
61
- end
62
-
63
- def json
64
- @json ||= JSON.parse(decoding_body)
65
- @json = JSON.parse(@json) if String === @json
66
- @json
67
- end
68
-
69
- # 通过readability 解析数据
70
- def readability
71
- @readability ||= Readability::Document.new(decoding_body, {do_not_guess_encoding: true})
72
- end
73
-
74
- # 解析
75
- def parsing
76
- json
77
- end
78
-
79
- # 获取解析结果
80
- def results
81
- @results ||= parsing
82
- end
83
-
84
- def get_date(str)
85
- time = Time.now
86
- case str
87
- when /^(\d{1,2})小时前$/
88
- time = time - $1.to_i.hours
89
- when /^(\d{1,2})月(\d{1,2})日$/
90
- time = Time.local(time.year, $1.to_i, $2.to_i)
91
- when /^(\d{4})年(\d{1,2})月(\d{1,2})日$/
92
- time = Time.local($1.to_i, $2.to_i, $3.to_i)
93
- when /^(\d{1,2})月(\d{1,2})日[ ]{0,3}(\d{1,2}):(\d{1,2})$/ # 09月30日 12:04
94
- time = Time.local(time.year, $1.to_i, $2.to_i, $3.to_i, $4.to_i)
95
- end
96
- return time
97
- end
98
-
99
-
100
- # 是否是网站验证 true表示正常数据、false表示弹出网站验证
101
- def web_verify(*arg)
102
- true
103
- end
104
- end # class Net::HTTPResponse
105
- end