http_crawler 0.2.3.3 → 0.3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1143552ea7737865d9fe582c48b7fa3f5824da47cbb0e16d8911050bd807dc68
4
- data.tar.gz: 0e55bdd075e617e1f60e9fb6a54f75366c7cc670801d6a49b48c007e612b00a3
3
+ metadata.gz: 870f70e609b513a6bdf5176e1ab2960d19e7b4f04d60c746845e6e27343288c7
4
+ data.tar.gz: 6706312c08462cb4dee0f8c2eac15b7882131832dc49f6cb336032ffdc0ab68d
5
5
  SHA512:
6
- metadata.gz: e26f0d48ec8318b0d977933d95f091a8e9ce00ef872ef8668d66dcbacc653e984871210b132468b530fd7d30261bb2caae592b93f7d0fa73067432c7b4b38750
7
- data.tar.gz: 98004bfd606c248e367d18bac5cea84304a5f83854d9c9ac4adedbbf5ee84a6533f6d909ffa73afaf66f0313605093f84da2bde25c78c3e6ed60b86e7e8c71f7
6
+ metadata.gz: 3c47791191aca7f3065eee2c9e223e3196f0890a4da8cd4bf8dee697eae18133758ac3225d49503d6ece004bc55474bbd9486b140b2d1588c92ee23d8b06663d
7
+ data.tar.gz: ff4da1b65de1431b9e6aa800e63d9e0b4c97ee942b68760136deb3bd819c08fc82504159e1a1ea14b6368458e8fa6a7568521a12d0eff259c637e2092791ea6f
@@ -0,0 +1,6 @@
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="Rubocop" enabled="false" level="WARNING" enabled_by_default="false" />
5
+ </profile>
6
+ </component>
data/.idea/workspace.xml CHANGED
@@ -2,6 +2,7 @@
2
2
  <project version="4">
3
3
  <component name="ChangeListManager">
4
4
  <list default="true" id="07223dd4-8944-486b-a29b-7461a5c9ec2d" name="Default" comment="">
5
+ <change afterPath="$PROJECT_DIR$/.idea/inspectionProfiles/Project_Default.xml" afterDir="false" />
5
6
  <change beforePath="$PROJECT_DIR$/.idea/http_crawler.iml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/http_crawler.iml" afterDir="false" />
6
7
  <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
7
8
  <change beforePath="$PROJECT_DIR$/lib/http_crawler/http.rb" beforeDir="false" afterPath="$PROJECT_DIR$/lib/http_crawler/http.rb" afterDir="false" />
@@ -17,47 +18,11 @@
17
18
  </component>
18
19
  <component name="FileEditorManager">
19
20
  <leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
20
- <file leaf-file-name="version.rb" pinned="false" current-in-tab="true">
21
- <entry file="file://$PROJECT_DIR$/lib/http_crawler/version.rb">
22
- <provider selected="true" editor-type-id="text-editor">
23
- <state relative-caret-position="21">
24
- <caret line="1" column="21" selection-start-line="1" selection-start-column="21" selection-end-line="1" selection-end-column="21" />
25
- </state>
26
- </provider>
27
- </entry>
28
- </file>
29
- <file leaf-file-name="web.rb" pinned="false" current-in-tab="false">
30
- <entry file="file://$PROJECT_DIR$/lib/http_crawler/web.rb">
31
- <provider selected="true" editor-type-id="text-editor">
32
- <state relative-caret-position="126">
33
- <caret line="6" selection-start-line="6" selection-end-line="6" />
34
- </state>
35
- </provider>
36
- </entry>
37
- </file>
38
21
  <file leaf-file-name="http.rb" pinned="false" current-in-tab="false">
39
22
  <entry file="file://$PROJECT_DIR$/lib/http_crawler/http.rb">
40
23
  <provider selected="true" editor-type-id="text-editor">
41
- <state relative-caret-position="4410">
42
- <caret line="210" column="11" lean-forward="true" selection-start-line="210" selection-start-column="11" selection-end-line="210" selection-end-column="11" />
43
- </state>
44
- </provider>
45
- </entry>
46
- </file>
47
- <file leaf-file-name="compat.rb" pinned="false" current-in-tab="false">
48
- <entry file="file://$USER_HOME$/.rvm/rubies/ruby-2.4.1/lib/ruby/2.4.0/webrick/compat.rb">
49
- <provider selected="true" editor-type-id="text-editor">
50
- <state relative-caret-position="378">
51
- <caret line="18" column="1" selection-start-line="18" selection-start-column="1" selection-end-line="18" selection-end-column="1" />
52
- </state>
53
- </provider>
54
- </entry>
55
- </file>
56
- <file leaf-file-name="errno.rb" pinned="false" current-in-tab="false">
57
- <entry file="file://$APPLICATION_HOME_DIR$/rubystubs24/errno.rb">
58
- <provider selected="true" editor-type-id="text-editor">
59
- <state relative-caret-position="672">
60
- <caret line="32" column="7" selection-start-line="32" selection-start-column="7" selection-end-line="32" selection-end-column="7" />
24
+ <state relative-caret-position="504">
25
+ <caret line="24" column="22" selection-start-line="24" selection-start-column="22" selection-end-line="24" selection-end-column="22" />
61
26
  </state>
62
27
  </provider>
63
28
  </entry>
@@ -71,11 +36,11 @@
71
36
  </provider>
72
37
  </entry>
73
38
  </file>
74
- <file leaf-file-name="proxy.rb" pinned="false" current-in-tab="false">
75
- <entry file="file://$PROJECT_DIR$/lib/http_crawler/proxy.rb">
39
+ <file leaf-file-name="client.rb" pinned="false" current-in-tab="false">
40
+ <entry file="file://$PROJECT_DIR$/lib/http_crawler/client.rb">
76
41
  <provider selected="true" editor-type-id="text-editor">
77
- <state relative-caret-position="441">
78
- <caret line="21" selection-start-line="21" selection-end-line="21" />
42
+ <state relative-caret-position="483">
43
+ <caret line="23" column="19" selection-start-line="23" selection-start-column="19" selection-end-line="23" selection-end-column="19" />
79
44
  </state>
80
45
  </provider>
81
46
  </entry>
@@ -83,17 +48,17 @@
83
48
  <file leaf-file-name="client.rb" pinned="false" current-in-tab="false">
84
49
  <entry file="file://$PROJECT_DIR$/lib/http_crawler/web/baidu/client.rb">
85
50
  <provider selected="true" editor-type-id="text-editor">
86
- <state relative-caret-position="21">
87
- <caret line="1" column="18" selection-start-line="1" selection-start-column="7" selection-end-line="1" selection-end-column="18" />
51
+ <state relative-caret-position="105">
52
+ <caret line="5" selection-start-line="5" selection-end-line="5" />
88
53
  </state>
89
54
  </provider>
90
55
  </entry>
91
56
  </file>
92
- <file leaf-file-name="Gemfile" pinned="false" current-in-tab="false">
93
- <entry file="file://$PROJECT_DIR$/Gemfile">
57
+ <file leaf-file-name="response.rb" pinned="false" current-in-tab="true">
58
+ <entry file="file://$PROJECT_DIR$/lib/http_crawler/net/response.rb">
94
59
  <provider selected="true" editor-type-id="text-editor">
95
- <state relative-caret-position="168">
96
- <caret line="8" column="23" selection-start-line="8" selection-start-column="23" selection-end-line="8" selection-end-column="23" />
60
+ <state relative-caret-position="65">
61
+ <caret line="4" column="21" selection-start-line="4" selection-start-column="8" selection-end-line="4" selection-end-column="21" />
97
62
  </state>
98
63
  </provider>
99
64
  </entry>
@@ -104,6 +69,8 @@
104
69
  <findStrings>
105
70
  <find>Crawler::Web</find>
106
71
  <find>&quot;Crawler</find>
72
+ <find>proxy</find>
73
+ <find>auto</find>
107
74
  </findStrings>
108
75
  <replaceStrings>
109
76
  <replace>HttpCrawler::Web</replace>
@@ -143,8 +110,8 @@
143
110
  <option value="$PROJECT_DIR$/lib/http_crawler/web/baidu/client.rb" />
144
111
  <option value="$PROJECT_DIR$/lib/http_crawler/proxy.rb" />
145
112
  <option value="$PROJECT_DIR$/lib/http_crawler/web.rb" />
146
- <option value="$PROJECT_DIR$/lib/http_crawler/http.rb" />
147
113
  <option value="$PROJECT_DIR$/lib/http_crawler/version.rb" />
114
+ <option value="$PROJECT_DIR$/lib/http_crawler/http.rb" />
148
115
  </list>
149
116
  </option>
150
117
  </component>
@@ -167,7 +134,6 @@
167
134
  <foldersAlwaysOnTop value="true" />
168
135
  </navigator>
169
136
  <panes>
170
- <pane id="Scope" />
171
137
  <pane id="ProjectPane">
172
138
  <subPane>
173
139
  <expand>
@@ -188,23 +154,23 @@
188
154
  </path>
189
155
  <path>
190
156
  <item name="http_crawler" type="b2602c69:ProjectViewProjectNode" />
191
- <item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
192
- </path>
193
- <path>
194
- <item name="http_crawler" type="b2602c69:ProjectViewProjectNode" />
195
- <item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
196
- <item name="&lt; RVM: ruby-2.4.1 &gt;" type="70bed36:NamedLibraryElementNode" />
157
+ <item name="http_crawler" type="462c0819:PsiDirectoryNode" />
158
+ <item name="lib" type="462c0819:PsiDirectoryNode" />
159
+ <item name="http_crawler" type="462c0819:PsiDirectoryNode" />
160
+ <item name="net" type="462c0819:PsiDirectoryNode" />
197
161
  </path>
198
162
  <path>
199
163
  <item name="http_crawler" type="b2602c69:ProjectViewProjectNode" />
200
- <item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
201
- <item name="&lt; RVM: ruby-2.4.1 &gt;" type="70bed36:NamedLibraryElementNode" />
202
- <item name="rubystubs24" type="462c0819:PsiDirectoryNode" />
164
+ <item name="http_crawler" type="462c0819:PsiDirectoryNode" />
165
+ <item name="lib" type="462c0819:PsiDirectoryNode" />
166
+ <item name="http_crawler" type="462c0819:PsiDirectoryNode" />
167
+ <item name="web" type="462c0819:PsiDirectoryNode" />
203
168
  </path>
204
169
  </expand>
205
170
  <select />
206
171
  </subPane>
207
172
  </pane>
173
+ <pane id="Scope" />
208
174
  </panes>
209
175
  </component>
210
176
  <component name="PropertiesComponent">
@@ -251,18 +217,19 @@
251
217
  <workItem from="1546240992243" duration="719000" />
252
218
  <workItem from="1546291493927" duration="464000" />
253
219
  <workItem from="1546436457874" duration="2443000" />
254
- <workItem from="1549964225949" duration="5000" />
220
+ <workItem from="1549964225949" duration="1209000" />
221
+ <workItem from="1550132724592" duration="3006000" />
222
+ <workItem from="1550208979012" duration="304000" />
255
223
  </task>
256
224
  <servers />
257
225
  </component>
258
226
  <component name="TimeTrackingManager">
259
- <option name="totallyTimeSpent" value="23113000" />
227
+ <option name="totallyTimeSpent" value="27627000" />
260
228
  </component>
261
229
  <component name="ToolWindowManager">
262
230
  <frame x="0" y="0" width="1680" height="1050" extended-state="0" />
263
- <editor active="true" />
264
231
  <layout>
265
- <window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.34188035" />
232
+ <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.35042736" />
266
233
  <window_info anchor="bottom" id="TODO" order="6" />
267
234
  <window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
268
235
  <window_info anchor="bottom" id="Event Log" order="7" side_tool="true" />
@@ -270,7 +237,7 @@
270
237
  <window_info anchor="bottom" id="Database Changes" order="7" show_stripe_button="false" />
271
238
  <window_info anchor="bottom" id="Version Control" order="7" />
272
239
  <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
273
- <window_info active="true" anchor="bottom" id="Terminal" order="7" visible="true" weight="0.45725647" />
240
+ <window_info anchor="bottom" id="Terminal" order="7" weight="0.45725647" />
274
241
  <window_info id="Favorites" order="2" side_tool="true" />
275
242
  <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
276
243
  <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
@@ -508,13 +475,6 @@
508
475
  </state>
509
476
  </provider>
510
477
  </entry>
511
- <entry file="file://$PROJECT_DIR$/lib/http_crawler/net/response.rb">
512
- <provider selected="true" editor-type-id="text-editor">
513
- <state relative-caret-position="90">
514
- <caret line="6" column="45" lean-forward="true" selection-start-line="6" selection-start-column="45" selection-end-line="6" selection-end-column="45" />
515
- </state>
516
- </provider>
517
- </entry>
518
478
  <entry file="file://$PROJECT_DIR$/lib/http_crawler.rb">
519
479
  <provider selected="true" editor-type-id="text-editor">
520
480
  <state relative-caret-position="120">
@@ -550,13 +510,6 @@
550
510
  </state>
551
511
  </provider>
552
512
  </entry>
553
- <entry file="file://$PROJECT_DIR$/lib/http_crawler/client.rb">
554
- <provider selected="true" editor-type-id="text-editor">
555
- <state relative-caret-position="90">
556
- <caret line="6" column="36" selection-start-line="6" selection-start-column="36" selection-end-line="6" selection-end-column="36" />
557
- </state>
558
- </provider>
559
- </entry>
560
513
  <entry file="file://$PROJECT_DIR$/lib/http_crawler/common.rb">
561
514
  <provider selected="true" editor-type-id="text-editor">
562
515
  <state relative-caret-position="42">
@@ -564,13 +517,6 @@
564
517
  </state>
565
518
  </provider>
566
519
  </entry>
567
- <entry file="file://$PROJECT_DIR$/lib/http_crawler/proxy.rb">
568
- <provider selected="true" editor-type-id="text-editor">
569
- <state relative-caret-position="441">
570
- <caret line="21" selection-start-line="21" selection-end-line="21" />
571
- </state>
572
- </provider>
573
- </entry>
574
520
  <entry file="file://$PROJECT_DIR$/lib/http_crawler/web.rb">
575
521
  <provider selected="true" editor-type-id="text-editor">
576
522
  <state relative-caret-position="126">
@@ -578,13 +524,6 @@
578
524
  </state>
579
525
  </provider>
580
526
  </entry>
581
- <entry file="file://$PROJECT_DIR$/lib/http_crawler/web/baidu/client.rb">
582
- <provider selected="true" editor-type-id="text-editor">
583
- <state relative-caret-position="21">
584
- <caret line="1" column="18" selection-start-line="1" selection-start-column="7" selection-end-line="1" selection-end-column="18" />
585
- </state>
586
- </provider>
587
- </entry>
588
527
  <entry file="file://$PROJECT_DIR$/Gemfile">
589
528
  <provider selected="true" editor-type-id="text-editor">
590
529
  <state relative-caret-position="168">
@@ -599,13 +538,6 @@
599
538
  </state>
600
539
  </provider>
601
540
  </entry>
602
- <entry file="file://$PROJECT_DIR$/lib/http_crawler/http.rb">
603
- <provider selected="true" editor-type-id="text-editor">
604
- <state relative-caret-position="4410">
605
- <caret line="210" column="11" lean-forward="true" selection-start-line="210" selection-start-column="11" selection-end-line="210" selection-end-column="11" />
606
- </state>
607
- </provider>
608
- </entry>
609
541
  <entry file="file://$APPLICATION_HOME_DIR$/rubystubs24/system_call_error.rb">
610
542
  <provider selected="true" editor-type-id="text-editor">
611
543
  <state relative-caret-position="150">
@@ -622,8 +554,43 @@
622
554
  </entry>
623
555
  <entry file="file://$PROJECT_DIR$/lib/http_crawler/version.rb">
624
556
  <provider selected="true" editor-type-id="text-editor">
625
- <state relative-caret-position="21">
626
- <caret line="1" column="21" selection-start-line="1" selection-start-column="21" selection-end-line="1" selection-end-column="21" />
557
+ <state relative-caret-position="63">
558
+ <caret line="3" selection-start-line="3" selection-end-line="3" />
559
+ </state>
560
+ </provider>
561
+ </entry>
562
+ <entry file="file://$PROJECT_DIR$/lib/http_crawler/proxy.rb">
563
+ <provider selected="true" editor-type-id="text-editor">
564
+ <state relative-caret-position="441">
565
+ <caret line="21" selection-start-line="21" selection-end-line="21" />
566
+ </state>
567
+ </provider>
568
+ </entry>
569
+ <entry file="file://$PROJECT_DIR$/lib/http_crawler/client.rb">
570
+ <provider selected="true" editor-type-id="text-editor">
571
+ <state relative-caret-position="483">
572
+ <caret line="23" column="19" selection-start-line="23" selection-start-column="19" selection-end-line="23" selection-end-column="19" />
573
+ </state>
574
+ </provider>
575
+ </entry>
576
+ <entry file="file://$PROJECT_DIR$/lib/http_crawler/web/baidu/client.rb">
577
+ <provider selected="true" editor-type-id="text-editor">
578
+ <state relative-caret-position="105">
579
+ <caret line="5" selection-start-line="5" selection-end-line="5" />
580
+ </state>
581
+ </provider>
582
+ </entry>
583
+ <entry file="file://$PROJECT_DIR$/lib/http_crawler/http.rb">
584
+ <provider selected="true" editor-type-id="text-editor">
585
+ <state relative-caret-position="504">
586
+ <caret line="24" column="22" selection-start-line="24" selection-start-column="22" selection-end-line="24" selection-end-column="22" />
587
+ </state>
588
+ </provider>
589
+ </entry>
590
+ <entry file="file://$PROJECT_DIR$/lib/http_crawler/net/response.rb">
591
+ <provider selected="true" editor-type-id="text-editor">
592
+ <state relative-caret-position="65">
593
+ <caret line="4" column="21" selection-start-line="4" selection-start-column="8" selection-end-line="4" selection-end-column="21" />
627
594
  </state>
628
595
  </provider>
629
596
  </entry>
@@ -1,3 +1,5 @@
1
+ load File.dirname(__FILE__) + '/http/response.rb'
2
+
1
3
  module HttpCrawler
2
4
  module Client
3
5
 
@@ -21,43 +23,32 @@ module HttpCrawler
21
23
  end
22
24
  end
23
25
 
24
- attr_reader :http, :uri
26
+ attr_reader :uri
25
27
 
26
- #
27
28
  # init_uri 如果未初始化@uri,则会报错
28
- # 继承类需要重定义 init_uri
29
+ # 继承类需要实现 @uri = URI("http://host")
29
30
  #
30
- def initialize
31
- raise "Client uri为空" unless init_uri
32
- @http = HttpCrawler::HTTP.new(uri.host, uri.port)
33
-
34
- @http.use_ssl = (uri.scheme == "https")
35
-
36
- @http.open_timeout = 5
37
- @http.read_timeout = 5
38
- @http.proxy_key = "#{self.class}"
39
- init_http
40
-
41
- Rails.logger.debug "proxy_key => #{@http.proxy_key}"
42
- end
43
-
44
- # 初始化http参数
45
- def init_http
46
-
31
+ def init_uri
32
+ @uri = nil
47
33
  end
48
34
 
49
- # 添加错误的url地址,表示这里面的url都是异常地址,存的是正则
50
- def add_error_url(url_string)
51
- @http.error_urls << url_string
35
+ # 初始化超时时间
36
+ def init_timeout
37
+ @connect_time = 5
38
+ @write_time = 2
39
+ @read_time = 5
52
40
  end
53
41
 
54
- # init_uri 如果未初始化@uri,则会报错
55
- # 继承类需要实现 @uri = URI("http://host")
56
- #
57
- def init_uri
58
- @uri = nil
42
+ # 初始化 ssl 协议
43
+ def init_ssl
44
+ if (@uri.scheme == "https")
45
+ # ssl 协议
46
+ @ctx = OpenSSL::SSL::SSLContext.new
47
+ @ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
48
+ end
59
49
  end
60
50
 
51
+ # 头文件相关方法
61
52
  def header
62
53
  @header ||= init_header
63
54
  end
@@ -70,20 +61,145 @@ module HttpCrawler
70
61
  nil
71
62
  end
72
63
 
73
- def update_proxy(proxy = {})
74
- @http.update_proxy(proxy)
64
+ # cookies
65
+ def cookies
66
+ @cookies ||= {}
75
67
  end
76
68
 
69
+
70
+ # 代理设置
77
71
  def auto_proxy=(value)
78
72
  Rails.logger.debug "自动更新代理"
79
- @http.auto_proxy = value
80
- @http.update_proxy if (value == true && @http.proxy? == false)
73
+ @auto_proxy = value
74
+ update_proxy if (value == true && @proxy.blank?)
75
+ end
76
+
77
+ # 代理使用的api方法名
78
+ def proxy_api
79
+ @proxy_api ||= "my"
80
+ end
81
+
82
+ # 调用代理 api使用的参数
83
+ def proxy_params
84
+ @proxy_params ||= {"key": "default"}
85
+ end
86
+
87
+ def update_proxy(proxy = {})
88
+ if (proxy.blank?)
89
+ @proxy = get_proxy
90
+ else
91
+ @proxy = proxy
92
+ end
93
+ # @http.update_proxy(proxy)
94
+ end
95
+
96
+
97
+ # 如果自动更新代理 则更新代理返回 true,否则返回false
98
+ def update_proxy?(proxy_ip = {})
99
+ if @auto_proxy
100
+ update_proxy(proxy_ip)
101
+ return true
102
+ else
103
+ return false
104
+ end
81
105
  end
82
106
 
83
- # 是否验证码界面
84
- def validation_page?(*arg)
85
- false
107
+
108
+ # 获取proxy
109
+ # 通过调用 api 获取代理或者通过自定义设置代理
110
+ def get_proxy
111
+ proxy_ip = nil
112
+ begin
113
+ Rails.logger.debug("开始获取代理IP")
114
+ proxy_client = HttpCrawler::Proxy.for(proxy_api)
115
+ proxy_r = proxy_client.get_proxy(proxy_params)
116
+ proxy_ip = proxy_r.results unless proxy_r.results.blank?
117
+ if proxy_ip.blank?
118
+ Rails.logger.warn "无最新代理等待5秒后重新获取"
119
+ else
120
+ break
121
+ end
122
+ sleep(5)
123
+ end while true
124
+
125
+ Rails.logger.debug("当前IP => #{@proxy},获取最新代理 => #{proxy_ip}")
126
+
127
+ unless proxy_ip && proxy_ip["p_addr"] && proxy_ip["p_port"]
128
+ Rails.logger.warn "无最新代理等待5秒后重新获取"
129
+ sleep(5)
130
+ proxy_ip = get_proxy
131
+ end
132
+
133
+ if (@proxy && proxy_ip && @proxy["p_addr"] == proxy_ip["p_addr"] && @proxy["p_port"] == proxy_ip["p_port"])
134
+ Rails.logger.warn "无最新代理等待5秒后重新获取"
135
+ sleep(5)
136
+ proxy_ip = get_proxy
137
+ end
138
+ proxy_ip
139
+ end
140
+
141
+ # 添加错误的url地址,表示这里面的url都是异常地址,存的是正则
142
+ def add_error_url(url_string)
143
+ @http.error_urls << url_string
144
+ end
145
+
146
+
147
+ # 初始化http参数
148
+ def init_client
149
+
150
+ end
151
+
152
+ # 初始化http请求前置条件
153
+ def http
154
+ # 自动重定向。最大重定向次数 max_hops: 5
155
+ h = HTTP.follow(max_hops: 5)
156
+
157
+ # 添加代理
158
+ h = h.via(@proxy["p_addr"], @proxy["p_port"].to_i, @proxy["p_user"], @proxy["p_pass"]) unless (@proxy.blank?)
159
+
160
+ # 添加头文件
161
+ h = h.headers(header) if header
162
+
163
+ # 添加cookies
164
+ h = h.cookies(cookies) if cookies
165
+
166
+ # 添加超时时间
167
+ h = h.timeout(connect: @connect_time, write: @write_time, read: @read_time)
168
+
169
+ h
170
+ end
171
+
172
+ # 发送 get 请求
173
+ def get(path, params = {})
174
+ http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx)
175
+ end
176
+
177
+ # 发送 post 请求
178
+ def post(path, params = {})
179
+ http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx)
180
+ end
181
+
182
+ #
183
+ # init_uri 如果未初始化@uri,则会报错
184
+ # 继承类需要重定义 init_uri
185
+ #
186
+ def initialize
187
+ # 初始化 uri
188
+ raise "Client uri为空" unless init_uri
189
+
190
+ # 初始化超时时间
191
+ init_timeout
192
+
193
+ # 初始化 ssl 协议
194
+ init_ssl
195
+
196
+ # 初始化一些 client 自定义参数
197
+ init_client
198
+
199
+ # 初始化 代理参数
200
+ @proxy_params = {key: "#{self.class}"}
86
201
  end
87
202
 
88
203
  end
89
- end
204
+ end
205
+
@@ -0,0 +1,52 @@
1
+ module HTTP
2
+ class Response
3
+
4
+ # 解压并转码 body 数据
5
+ def decoding_body
6
+ @decoding_body ||= self.to_s
7
+ end
8
+
9
+ # def decoding_body
10
+
11
+ def html
12
+ @html ||= Nokogiri::HTML(decoding_body)
13
+ end
14
+
15
+ def json
16
+ @json ||= JSON.parse(decoding_body)
17
+ @json = JSON.parse(@json) if String === @json
18
+ @json
19
+ end
20
+
21
+ # 通过readability 解析数据
22
+ def readability
23
+ @readability ||= Readability::Document.new(decoding_body, {do_not_guess_encoding: true})
24
+ end
25
+
26
+ # 解析
27
+ def parsing
28
+ json
29
+ end
30
+
31
+ # 获取解析结果
32
+ def results
33
+ @results ||= parsing
34
+ end
35
+
36
+ def get_date(str)
37
+ time = Time.now
38
+ case str
39
+ when /^(\d{1,2})小时前$/
40
+ time = time - $1.to_i.hours
41
+ when /^(\d{1,2})月(\d{1,2})日$/
42
+ time = Time.local(time.year, $1.to_i, $2.to_i)
43
+ when /^(\d{4})年(\d{1,2})月(\d{1,2})日$/
44
+ time = Time.local($1.to_i, $2.to_i, $3.to_i)
45
+ when /^(\d{1,2})月(\d{1,2})日[ ]{0,3}(\d{1,2}):(\d{1,2})$/ # 09月30日 12:04
46
+ time = Time.local(time.year, $1.to_i, $2.to_i, $3.to_i, $4.to_i)
47
+ end
48
+ return time
49
+ end
50
+
51
+ end # class Net::HTTPResponse
52
+ end
@@ -158,7 +158,9 @@ module HttpCrawler
158
158
  response.error!
159
159
  end
160
160
  else
161
- server_error_sleep
161
+ Rails.logger.debug uri_or_path
162
+ Rails.logger.debug initheader
163
+ Rails.logger.debug response.body
162
164
  response.error!
163
165
  end
164
166
  end
@@ -255,5 +257,4 @@ module HttpCrawler
255
257
  end
256
258
 
257
259
 
258
- load File.dirname(__FILE__) + '/net/http.rb'
259
- load File.dirname(__FILE__) + '/net/response.rb'
260
+ load File.dirname(__FILE__) + '/http/response.rb'
@@ -22,7 +22,7 @@ module HttpCrawler
22
22
 
23
23
  # http://39.108.59.38:7772/Tools/proxyIP.ashx?OrderNumber=ccd4c8912691f28861a1ed048fec88dc&poolIndex=22717&cache=1&qty=2
24
24
  def get_proxy(parameter = {})
25
- r = http.get_fetch("/api/get_proxy")
25
+ r = http.get("/api/get_proxy")
26
26
  r.extend(HttpCrawler::Proxy::TestProxyApi::Response::GetProxy)
27
27
  end
28
28
 
@@ -1,3 +1,3 @@
1
1
  module HttpCrawler
2
- VERSION = "0.2.3.3"
2
+ VERSION = "0.3.0.0"
3
3
  end
@@ -6,20 +6,24 @@ module HttpCrawler
6
6
 
7
7
  include(HttpCrawler::Client)
8
8
 
9
- def init_http
10
- @http.open_timeout = 3
11
- @http.read_timeout = 3
12
- end
13
-
14
9
  def init_uri
15
- @uri = URI("https://www.baidu.com/")
10
+ @uri = URI("https://www.baidu.com")
16
11
  end
17
12
 
18
13
  def index(parameter = {})
19
- r = http.get_fetch("/", header)
14
+ r = get("/")
20
15
  r.extend(HttpCrawler::Web::Baidu::Response::Index)
21
16
  end
22
17
 
18
+ def search(parameter = {})
19
+ raise "parameter[:keyword] 不能为空" unless parameter[:keyword]
20
+ params = {
21
+ "wd": parameter[:keyword]
22
+ }
23
+ r = get("/s",params)
24
+ r
25
+ end
26
+
23
27
  end
24
28
  end # module Baidu
25
29
  end # module Web
data/lib/http_crawler.rb CHANGED
@@ -4,7 +4,6 @@ require 'digest/md5'
4
4
  require 'nokogiri'
5
5
 
6
6
  load 'http_crawler/common.rb'
7
- load 'http_crawler/http.rb'
8
7
  load 'http_crawler/client.rb'
9
8
  load 'http_crawler/web.rb'
10
9
  load 'http_crawler/proxy.rb'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: http_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3.3
4
+ version: 0.3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - jagger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-02-12 00:00:00.000000000 Z
11
+ date: 2019-02-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -118,6 +118,7 @@ files:
118
118
  - ".gitignore"
119
119
  - ".idea/.rakeTasks"
120
120
  - ".idea/http_crawler.iml"
121
+ - ".idea/inspectionProfiles/Project_Default.xml"
121
122
  - ".idea/misc.xml"
122
123
  - ".idea/modules.xml"
123
124
  - ".idea/vcs.xml"
@@ -136,8 +137,7 @@ files:
136
137
  - lib/http_crawler/common/object.rb
137
138
  - lib/http_crawler/common/string.rb
138
139
  - lib/http_crawler/http.rb
139
- - lib/http_crawler/net/http.rb
140
- - lib/http_crawler/net/response.rb
140
+ - lib/http_crawler/http/response.rb
141
141
  - lib/http_crawler/proxy.rb
142
142
  - lib/http_crawler/proxy/README.md
143
143
  - lib/http_crawler/proxy/client.rb
@@ -1,7 +0,0 @@
1
- module Net
2
- class HTTP
3
-
4
-
5
- end # class HTTP
6
- end # module Net
7
-
@@ -1,105 +0,0 @@
1
- module Net
2
- class HTTPResponse
3
-
4
- # 解压并转码 body 数据
5
- def decoding_body
6
-
7
- return @decoding_body if @decoding_body
8
- return nil unless body
9
-
10
- # 数据解压
11
- case header['Content-Encoding']
12
- when 'gzip' then
13
- sio = StringIO.new(body)
14
- gz = Zlib::GzipReader.new(sio)
15
- @decoding_body = gz.read()
16
- when 'br'
17
- @decoding_body = Brotli.inflate(body)
18
- when 'deflate'
19
- # 可能错误代码 暂时没解决 deflate 编码格式
20
- @decoding_body = Zlib::Inflate.inflate(body)
21
- else
22
- @decoding_body = body
23
- end
24
-
25
- # 判断解压后数据编码格式
26
-
27
- # 从header取编码格式
28
- encoding = header['Content-Type'][/charset=([^, ;"]*)/, 1] if header['Content-Type']
29
-
30
- # 从html中的 charset 取编码格式
31
- encoding = @decoding_body[/charset=([^, ;"]*)/, 1] unless encoding
32
-
33
- # 通过 CharDet 判断编码格式
34
- encoding = CharDet.detect(@decoding_body)["encoding"] unless encoding
35
-
36
-
37
- # 进行转码
38
- begin
39
- @decoding_body.force_encoding(encoding).encode!('utf-8') if encoding && encoding != @decoding_body.encoding
40
- rescue => e
41
- # 转码错误后再次使用 CharDet 判断编码格式后进行转码
42
- cd = CharDet.detect(@decoding_body)["encoding"]
43
- if (cd && cd != encoding)
44
- @decoding_body.force_encoding(cd).encode!('utf-8') if encoding != @decoding_body.encoding
45
- else
46
- # 还是转码错误则抛出异常
47
- Rails.logger.debug "encoding => #{encoding}"
48
- Rails.logger.debug "cd => #{cd}"
49
- Rails.logger.debug "@decoding_body[0..200] => #{@decoding_body[0..200]}"
50
- raise e
51
- end
52
- end
53
-
54
- @decoding_body
55
- end
56
-
57
- # def decoding_body
58
-
59
- def html
60
- @html ||= Nokogiri::HTML(decoding_body)
61
- end
62
-
63
- def json
64
- @json ||= JSON.parse(decoding_body)
65
- @json = JSON.parse(@json) if String === @json
66
- @json
67
- end
68
-
69
- # 通过readability 解析数据
70
- def readability
71
- @readability ||= Readability::Document.new(decoding_body, {do_not_guess_encoding: true})
72
- end
73
-
74
- # 解析
75
- def parsing
76
- json
77
- end
78
-
79
- # 获取解析结果
80
- def results
81
- @results ||= parsing
82
- end
83
-
84
- def get_date(str)
85
- time = Time.now
86
- case str
87
- when /^(\d{1,2})小时前$/
88
- time = time - $1.to_i.hours
89
- when /^(\d{1,2})月(\d{1,2})日$/
90
- time = Time.local(time.year, $1.to_i, $2.to_i)
91
- when /^(\d{4})年(\d{1,2})月(\d{1,2})日$/
92
- time = Time.local($1.to_i, $2.to_i, $3.to_i)
93
- when /^(\d{1,2})月(\d{1,2})日[ ]{0,3}(\d{1,2}):(\d{1,2})$/ # 09月30日 12:04
94
- time = Time.local(time.year, $1.to_i, $2.to_i, $3.to_i, $4.to_i)
95
- end
96
- return time
97
- end
98
-
99
-
100
- # 是否是网站验证 true表示正常数据、false表示弹出网站验证
101
- def web_verify(*arg)
102
- true
103
- end
104
- end # class Net::HTTPResponse
105
- end