watobo 0.9.21 → 0.9.23

Sign up to get free protection for your applications and to get access to all the features.
Files changed (283) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +46 -1
  3. data/bin/nfq_server.rb +0 -9
  4. data/bin/watobo_gui.rb +3 -13
  5. data/custom-views/prettify-json.rb +9 -18
  6. data/icons/watobo.ico +0 -0
  7. data/icons/watobo.ico.old +0 -0
  8. data/lib/watobo.rb +10 -19
  9. data/lib/watobo/adapters.rb +5 -14
  10. data/lib/watobo/adapters/data_store.rb +50 -59
  11. data/lib/watobo/adapters/file/file_store.rb +287 -296
  12. data/lib/watobo/adapters/file/marshal_store.rb +293 -296
  13. data/lib/watobo/adapters/session_store.rb +5 -14
  14. data/lib/watobo/ca.rb +1 -10
  15. data/lib/watobo/config.rb +197 -206
  16. data/lib/watobo/constants.rb +0 -9
  17. data/lib/watobo/core.rb +3 -12
  18. data/lib/watobo/core/active_check.rb +72 -135
  19. data/lib/watobo/core/active_checks.rb +49 -58
  20. data/lib/watobo/core/ca.rb +369 -389
  21. data/lib/watobo/core/cert_store.rb +34 -43
  22. data/lib/watobo/core/chat.rb +92 -101
  23. data/lib/watobo/core/chats.rb +271 -280
  24. data/lib/watobo/core/client_cert_store.rb +106 -35
  25. data/lib/watobo/core/conversation.rb +48 -57
  26. data/lib/watobo/core/cookie.rb +23 -32
  27. data/lib/watobo/core/egress_handlers.rb +98 -0
  28. data/lib/watobo/core/finding.rb +66 -75
  29. data/lib/watobo/core/findings.rb +107 -114
  30. data/lib/watobo/core/forwarding_proxy.rb +13 -22
  31. data/lib/watobo/core/fuzz_gen.rb +0 -9
  32. data/lib/watobo/core/intercept_carver.rb +166 -177
  33. data/lib/watobo/core/intercept_filter.rb +235 -244
  34. data/lib/watobo/core/interceptor.rb +98 -107
  35. data/lib/watobo/core/min_class.rb +4 -13
  36. data/lib/watobo/core/netfilter_queue.rb +170 -179
  37. data/lib/watobo/core/ott_cache.rb +132 -141
  38. data/lib/watobo/core/parameter.rb +43 -52
  39. data/lib/watobo/core/passive_check.rb +103 -102
  40. data/lib/watobo/core/passive_checks.rb +48 -57
  41. data/lib/watobo/core/passive_scanner.rb +54 -55
  42. data/lib/watobo/core/plugin.rb +11 -20
  43. data/lib/watobo/core/project.rb +3 -9
  44. data/lib/watobo/core/proxy.rb +43 -52
  45. data/lib/watobo/core/request.rb +125 -123
  46. data/lib/watobo/core/response.rb +44 -53
  47. data/lib/watobo/core/scanner.rb +0 -9
  48. data/lib/watobo/core/scanner3.rb +405 -414
  49. data/lib/watobo/core/scope.rb +83 -92
  50. data/lib/watobo/core/session.rb +1043 -1026
  51. data/lib/watobo/core/sid_cache.rb +98 -107
  52. data/lib/watobo/core/subscriber.rb +25 -34
  53. data/lib/watobo/defaults.rb +21 -30
  54. data/lib/watobo/external/diff/lcs.rb +0 -9
  55. data/lib/watobo/external/diff/lcs/array.rb +0 -9
  56. data/lib/watobo/external/diff/lcs/block.rb +0 -9
  57. data/lib/watobo/external/diff/lcs/callbacks.rb +0 -9
  58. data/lib/watobo/external/diff/lcs/change.rb +0 -9
  59. data/lib/watobo/external/diff/lcs/hunk.rb +0 -9
  60. data/lib/watobo/external/diff/lcs/ldiff.rb +0 -9
  61. data/lib/watobo/external/diff/lcs/string.rb +0 -9
  62. data/lib/watobo/externals.rb +6 -15
  63. data/lib/watobo/framework.rb +4 -13
  64. data/lib/watobo/framework/create_project.rb +60 -69
  65. data/lib/watobo/framework/init.rb +0 -9
  66. data/lib/watobo/framework/init_modules.rb +0 -9
  67. data/lib/watobo/framework/license_text.rb +28 -37
  68. data/lib/watobo/framework/load_chat.rb +13 -22
  69. data/lib/watobo/gui.rb +132 -123
  70. data/lib/watobo/gui/about_watobo.rb +0 -9
  71. data/lib/watobo/gui/browser_preview.rb +0 -9
  72. data/lib/watobo/gui/certificate_dialog.rb +0 -9
  73. data/lib/watobo/gui/chat_diff.rb +0 -9
  74. data/lib/watobo/gui/chatviewer_frame.rb +73 -72
  75. data/lib/watobo/gui/checkboxtree.rb +0 -9
  76. data/lib/watobo/gui/checks_policy_frame.rb +0 -9
  77. data/lib/watobo/gui/client_cert_dialog.rb +96 -87
  78. data/lib/watobo/gui/confirm_scan_dialog.rb +0 -9
  79. data/lib/watobo/gui/conversation_table.rb +158 -164
  80. data/lib/watobo/gui/conversation_table_ctrl.rb +207 -216
  81. data/lib/watobo/gui/conversation_table_ctrl2.rb +373 -382
  82. data/lib/watobo/gui/csrf_token_dialog.rb +0 -9
  83. data/lib/watobo/gui/custom_viewer.rb +374 -383
  84. data/lib/watobo/gui/dashboard.rb +296 -303
  85. data/lib/watobo/gui/define_scope_frame.rb +0 -9
  86. data/lib/watobo/gui/differ_frame.rb +215 -224
  87. data/lib/watobo/gui/edit_comment.rb +0 -9
  88. data/lib/watobo/gui/edit_scope_dialog.rb +0 -9
  89. data/lib/watobo/gui/export_dialog.rb +104 -113
  90. data/lib/watobo/gui/finding_info.rb +0 -9
  91. data/lib/watobo/gui/findings_tree.rb +210 -217
  92. data/lib/watobo/gui/full_scan_dialog.rb +0 -9
  93. data/lib/watobo/gui/fuzzer_gui.rb +1295 -1313
  94. data/lib/watobo/gui/fxsave_thread.rb +14 -0
  95. data/lib/watobo/gui/goto_url_dialog.rb +70 -79
  96. data/lib/watobo/gui/hex_viewer.rb +0 -9
  97. data/lib/watobo/gui/html_viewer.rb +287 -296
  98. data/lib/watobo/gui/intercept_filter_dialog.rb +188 -197
  99. data/lib/watobo/gui/interceptor_gui.rb +1041 -1051
  100. data/lib/watobo/gui/interceptor_settings_dialog.rb +0 -9
  101. data/lib/watobo/gui/json_viewer.rb +287 -0
  102. data/lib/watobo/gui/list_box.rb +101 -110
  103. data/lib/watobo/gui/log_file_viewer.rb +32 -41
  104. data/lib/watobo/gui/log_viewer.rb +83 -88
  105. data/lib/watobo/gui/login_wizzard.rb +0 -9
  106. data/lib/watobo/gui/main_window.rb +587 -618
  107. data/lib/watobo/gui/manual_request_editor.rb +620 -565
  108. data/lib/watobo/gui/master_pw_dialog.rb +0 -9
  109. data/lib/watobo/gui/mixins/gui_settings.rb +29 -38
  110. data/lib/watobo/gui/page_tree.rb +217 -226
  111. data/lib/watobo/gui/password_policy_dialog.rb +0 -9
  112. data/lib/watobo/gui/plugin_board.rb +0 -9
  113. data/lib/watobo/gui/preferences_dialog.rb +0 -9
  114. data/lib/watobo/gui/progress_window.rb +17 -27
  115. data/lib/watobo/gui/project_wizzard.rb +0 -9
  116. data/lib/watobo/gui/proxy_dialog.rb +1 -10
  117. data/lib/watobo/gui/quick_scan_dialog.rb +0 -9
  118. data/lib/watobo/gui/request_builder_frame.rb +102 -111
  119. data/lib/watobo/gui/request_editor.rb +181 -137
  120. data/lib/watobo/gui/rewrite_filters_dialog.rb +394 -403
  121. data/lib/watobo/gui/rewrite_rules_dialog.rb +372 -381
  122. data/lib/watobo/gui/save_chat_dialog.rb +140 -149
  123. data/lib/watobo/gui/scanner_settings_dialog.rb +0 -9
  124. data/lib/watobo/gui/select_chat_dialog.rb +0 -9
  125. data/lib/watobo/gui/session_management_dialog.rb +0 -9
  126. data/lib/watobo/gui/sites_tree.rb +0 -9
  127. data/lib/watobo/gui/status_bar.rb +0 -9
  128. data/lib/watobo/gui/table_editor.rb +0 -9
  129. data/lib/watobo/gui/tagless_viewer.rb +0 -9
  130. data/lib/watobo/gui/templates/plugin.rb +0 -9
  131. data/lib/watobo/gui/templates/plugin2.rb +92 -100
  132. data/lib/watobo/gui/templates/plugin_base.rb +144 -153
  133. data/lib/watobo/gui/text_viewer.rb +0 -9
  134. data/lib/watobo/gui/transcoder_window.rb +0 -9
  135. data/lib/watobo/gui/utils/gui_utils.rb +0 -9
  136. data/lib/watobo/gui/utils/init_icons.rb +86 -95
  137. data/lib/watobo/gui/utils/load_icons.rb +33 -42
  138. data/lib/watobo/gui/utils/load_plugins.rb +116 -119
  139. data/lib/watobo/gui/utils/master_password.rb +68 -77
  140. data/lib/watobo/gui/utils/save_default_settings.rb +113 -122
  141. data/lib/watobo/gui/utils/save_project_settings.rb +0 -9
  142. data/lib/watobo/gui/utils/save_proxy_settings.rb +41 -50
  143. data/lib/watobo/gui/utils/save_scanner_settings.rb +18 -27
  144. data/lib/watobo/gui/utils/session_history.rb +112 -121
  145. data/lib/watobo/gui/workspace_dialog.rb +0 -9
  146. data/lib/watobo/gui/www_auth_dialog.rb +0 -9
  147. data/lib/watobo/gui/xml_viewer_frame.rb +0 -9
  148. data/lib/watobo/http.rb +4 -13
  149. data/lib/watobo/http/cookies/cookies.rb +26 -35
  150. data/lib/watobo/http/data/data.rb +45 -54
  151. data/lib/watobo/http/data/json.rb +47 -55
  152. data/lib/watobo/http/url/url.rb +38 -47
  153. data/lib/watobo/http/xml/xml.rb +124 -130
  154. data/lib/watobo/interceptor.rb +3 -12
  155. data/lib/watobo/interceptor/proxy.rb +742 -739
  156. data/lib/watobo/interceptor/transparent.rb +22 -24
  157. data/lib/watobo/mixins.rb +10 -19
  158. data/lib/watobo/mixins/check_info.rb +27 -36
  159. data/lib/watobo/mixins/httpparser.rb +613 -637
  160. data/lib/watobo/mixins/request_parser.rb +88 -97
  161. data/lib/watobo/mixins/shapers.rb +515 -529
  162. data/lib/watobo/mixins/transcoders.rb +3 -11
  163. data/lib/watobo/parser.rb +1 -10
  164. data/lib/watobo/parser/html.rb +83 -92
  165. data/lib/watobo/patch_fxruby_setfocus.rb +26 -0
  166. data/lib/watobo/sockets.rb +3 -12
  167. data/lib/watobo/sockets/agent.rb +828 -837
  168. data/lib/watobo/sockets/client_socket.rb +308 -312
  169. data/lib/watobo/sockets/connection.rb +401 -410
  170. data/lib/watobo/sockets/http_socket.rb +11 -13
  171. data/lib/watobo/sockets/ntlm_auth.rb +129 -138
  172. data/lib/watobo/utils.rb +10 -19
  173. data/lib/watobo/utils/check_regex.rb +0 -9
  174. data/lib/watobo/utils/copy_object.rb +0 -9
  175. data/lib/watobo/utils/crypto.rb +0 -9
  176. data/lib/watobo/utils/expand_range.rb +23 -32
  177. data/lib/watobo/utils/export_xml.rb +97 -106
  178. data/lib/watobo/utils/file_management.rb +9 -11
  179. data/lib/watobo/utils/hexprint.rb +9 -18
  180. data/lib/watobo/utils/load_chat.rb +0 -9
  181. data/lib/watobo/utils/load_icon.rb +0 -9
  182. data/lib/watobo/utils/ntlm.rb +866 -875
  183. data/lib/watobo/utils/print_debug.rb +12 -21
  184. data/lib/watobo/utils/response_builder.rb +90 -99
  185. data/lib/watobo/utils/response_hash.rb +0 -9
  186. data/lib/watobo/utils/secure_eval.rb +0 -9
  187. data/lib/watobo/utils/strings.rb +10 -19
  188. data/lib/watobo/utils/text2request.rb +0 -9
  189. data/lib/watobo/utils/url.rb +23 -32
  190. data/lib/watobo/utils/utf16.rb +11 -20
  191. data/modules/active/Apache/mod_status.rb +0 -9
  192. data/modules/active/Apache/multiview.rb +151 -160
  193. data/modules/active/Flash/crossdomain.rb +0 -9
  194. data/modules/active/JWT/jwt_oauth2_none.rb +111 -0
  195. data/modules/active/cq5/cq5_default_selectors.rb +106 -115
  196. data/modules/active/cq5/cqp_user_enumeration.rb +125 -134
  197. data/modules/active/directories/dirwalker.rb +0 -9
  198. data/modules/active/discovery/fileextensions.rb +0 -9
  199. data/modules/active/discovery/http_methods.rb +0 -9
  200. data/modules/active/discovery/jsmapfiles.rb +79 -0
  201. data/modules/active/domino/domino_db.rb +68 -76
  202. data/modules/active/dotNET/custom_errors.rb +102 -111
  203. data/modules/active/dotNET/dotnet_files.rb +90 -99
  204. data/modules/active/fileinclusion/lfi_simple.rb +0 -9
  205. data/modules/active/jboss/jboss_basic.rb +0 -9
  206. data/modules/active/sap/business_objects.rb +51 -60
  207. data/modules/active/sap/its_commands.rb +0 -9
  208. data/modules/active/sap/its_service_parameter.rb +0 -9
  209. data/modules/active/sap/its_services.rb +0 -9
  210. data/modules/active/sap/its_xss.rb +0 -9
  211. data/modules/active/shell_shock/shell_shock.rb +139 -148
  212. data/modules/active/siebel/siebel_apps.rb +160 -169
  213. data/modules/active/sqlinjection/sql_boolean.rb +0 -9
  214. data/modules/active/sqlinjection/sql_numerical.rb +198 -0
  215. data/modules/active/sqlinjection/sqli_error.rb +0 -9
  216. data/modules/active/sqlinjection/sqli_timing.rb +220 -229
  217. data/modules/active/struts2/default_handler_ognl.rb +106 -115
  218. data/modules/active/struts2/include_params_ognl.rb +105 -114
  219. data/modules/active/xml/xml_xxe.rb +112 -123
  220. data/modules/active/xss/xss_ng.rb +214 -223
  221. data/modules/active/xss/xss_simple.rb +0 -9
  222. data/modules/passive/ajax.rb +68 -77
  223. data/modules/passive/autocomplete.rb +56 -65
  224. data/modules/passive/cookie_options.rb +0 -9
  225. data/modules/passive/cookie_xss.rb +0 -9
  226. data/modules/passive/detect_code.rb +0 -9
  227. data/modules/passive/detect_fileupload.rb +0 -9
  228. data/modules/passive/detect_infrastructure.rb +0 -9
  229. data/modules/passive/detect_one_time_tokens.rb +0 -9
  230. data/modules/passive/dirindexing.rb +0 -9
  231. data/modules/passive/disclosure_domino.rb +55 -64
  232. data/modules/passive/disclosure_emails.rb +0 -9
  233. data/modules/passive/disclosure_ipaddr.rb +55 -53
  234. data/modules/passive/filename_as_parameter.rb +0 -9
  235. data/modules/passive/form_spotter.rb +0 -9
  236. data/modules/passive/hidden_fields.rb +50 -59
  237. data/modules/passive/hotspots.rb +0 -9
  238. data/modules/passive/in_script_parameter.rb +0 -9
  239. data/modules/passive/json_web_token.rb +93 -0
  240. data/modules/passive/multiple_server_headers.rb +0 -9
  241. data/modules/passive/possible_login.rb +0 -9
  242. data/modules/passive/redirect_url.rb +0 -9
  243. data/modules/passive/redirectionz.rb +0 -9
  244. data/modules/passive/sap-headers.rb +56 -65
  245. data/modules/passive/xss_dom.rb +0 -9
  246. data/plugins/aem/aem.rb +11 -20
  247. data/plugins/aem/gui/main.rb +118 -127
  248. data/plugins/aem/gui/tree_view.rb +171 -180
  249. data/plugins/aem/lib/agent.rb +130 -138
  250. data/plugins/aem/lib/dispatcher.rb +45 -51
  251. data/plugins/aem/lib/engine.rb +177 -186
  252. data/plugins/catalog/catalog.rb +345 -355
  253. data/plugins/crawler/crawler.rb +4 -13
  254. data/plugins/crawler/gui.rb +5 -14
  255. data/plugins/crawler/gui/auth_frame.rb +270 -279
  256. data/plugins/crawler/gui/crawler_gui.rb +271 -276
  257. data/plugins/crawler/gui/general_settings_frame.rb +96 -105
  258. data/plugins/crawler/gui/hooks_frame.rb +80 -89
  259. data/plugins/crawler/gui/scope_frame.rb +50 -59
  260. data/plugins/crawler/gui/settings_tabbook.rb +38 -47
  261. data/plugins/crawler/gui/status_frame.rb +59 -68
  262. data/plugins/crawler/lib/bags.rb +18 -27
  263. data/plugins/crawler/lib/constants.rb +11 -20
  264. data/plugins/crawler/lib/engine.rb +488 -497
  265. data/plugins/crawler/lib/grabber.rb +68 -77
  266. data/plugins/crawler/lib/status.rb +71 -80
  267. data/plugins/crawler/lib/uri_mp.rb +12 -21
  268. data/plugins/filefinder/filefinder.rb +326 -333
  269. data/plugins/sqlmap/bin/test.rb +78 -87
  270. data/plugins/sqlmap/gui.rb +4 -13
  271. data/plugins/sqlmap/gui/main.rb +218 -227
  272. data/plugins/sqlmap/gui/options_frame.rb +97 -106
  273. data/plugins/sqlmap/lib/sqlmap_ctrl.rb +90 -100
  274. data/plugins/sqlmap/sqlmap.rb +2 -11
  275. data/plugins/sslchecker/cli/sslchecker_cli.rb +0 -9
  276. data/plugins/sslchecker/gui/cipher_table.rb +246 -254
  277. data/plugins/sslchecker/gui/gui.rb +258 -264
  278. data/plugins/sslchecker/gui/sslchecker.rb +4 -13
  279. data/plugins/sslchecker/lib/check.rb +127 -133
  280. data/plugins/wshell/gui/main.rb +119 -117
  281. data/plugins/wshell/lib/core.rb +38 -88
  282. data/plugins/wshell/wshell.rb +11 -20
  283. metadata +170 -164
@@ -1,49 +1,40 @@
1
- #.
2
- # settings_tabbook.rb
3
- #.
4
- # Copyright 2014 by siberas, http://www.siberas.de
5
- # This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
6
- # WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
7
- # WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
8
- # You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
9
-
10
1
  # @private
11
- module Watobo#:nodoc: all
12
- module Plugin
13
- module Crawler
14
- class Gui
15
- class SettingsTabBook < FXTabBook
16
- attr :hooks, :general, :log_viewer, :auth, :scope
17
-
18
-
19
-
20
- def initialize(owner)
21
- #@tab = FXTabBook.new(self, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
22
- super(owner, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
23
- FXTabItem.new(self, "General", nil)
24
- # frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_RAISED)
25
- @general = GeneralSettingsFrame.new(self)
26
-
27
- FXTabItem.new(self, "Scope", nil)
28
- @scope = ScopeFrame.new(self)
29
-
30
- FXTabItem.new(self, "Auth", nil)
31
- @auth = AuthFrame.new(self)
32
-
33
-
34
- FXTabItem.new(self, "Hooks", nil)
35
- @hooks = HooksFrame.new(self)
36
-
37
- FXTabItem.new(self, "Log", nil)
38
- frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_THICK|FRAME_RAISED)
39
- @log_viewer = Watobo::Gui::LogViewer.new(frame, :append, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_SUNKEN)
40
-
41
- self.connect(SEL_COMMAND){
42
- @hooks.selected if self.current == 3
43
- }
44
- end
45
- end
46
- end
47
- end
48
- end
2
+ module Watobo#:nodoc: all
3
+ module Plugin
4
+ module Crawler
5
+ class Gui
6
+ class SettingsTabBook < FXTabBook
7
+ attr :hooks, :general, :log_viewer, :auth, :scope
8
+
9
+
10
+
11
+ def initialize(owner)
12
+ #@tab = FXTabBook.new(self, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
13
+ super(owner, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
14
+ FXTabItem.new(self, "General", nil)
15
+ # frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_RAISED)
16
+ @general = GeneralSettingsFrame.new(self)
17
+
18
+ FXTabItem.new(self, "Scope", nil)
19
+ @scope = ScopeFrame.new(self)
20
+
21
+ FXTabItem.new(self, "Auth", nil)
22
+ @auth = AuthFrame.new(self)
23
+
24
+
25
+ FXTabItem.new(self, "Hooks", nil)
26
+ @hooks = HooksFrame.new(self)
27
+
28
+ FXTabItem.new(self, "Log", nil)
29
+ frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_THICK|FRAME_RAISED)
30
+ @log_viewer = Watobo::Gui::LogViewer.new(frame, :append, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_SUNKEN)
31
+
32
+ self.connect(SEL_COMMAND){
33
+ @hooks.selected if self.current == 3
34
+ }
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
49
40
  end
@@ -1,71 +1,62 @@
1
- #.
2
- # status_frame.rb
3
- #.
4
- # Copyright 2014 by siberas, http://www.siberas.de
5
- # This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
6
- # WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
7
- # WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
8
- # You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
9
-
10
1
  # @private
11
- module Watobo#:nodoc: all
12
- module Plugin
13
- module Crawler
14
- class Gui
15
- class StatusFrame < FXHorizontalFrame
16
-
17
- include Watobo::Plugin::Crawler::Constants
18
- # :engine_status => CRAWL_NONE,
19
- # :page_size => 0,
20
- # :link_size => 0,
21
- # :skipped_domains => 0
2
+ module Watobo#:nodoc: all
3
+ module Plugin
4
+ module Crawler
5
+ class Gui
6
+ class StatusFrame < FXHorizontalFrame
7
+
8
+ include Watobo::Plugin::Crawler::Constants
9
+ # :engine_status => CRAWL_NONE,
10
+ # :page_size => 0,
11
+ # :link_size => 0,
12
+ # :skipped_domains => 0
22
13
  def update_status(status)
23
- #puts status.to_yaml
24
- if status.has_key? :engine_status
25
- case status[:engine_status]
26
- when CRAWL_NONE
27
- self.backColor = self.parent.backColor
28
- @status_txt.text = "Status: Idle"
29
- when CRAWL_RUNNING
30
- self.backColor = FXColor::Red
31
- @status_txt.text = "Status: Running"
32
-
33
- when CRAWL_PAUSED
34
- self.backColor = FXColor::Yellow
35
- @status_txt.text = "Status: Paused"
36
- end
37
- end
38
-
39
- if status.has_key? :link_size
40
- @link_size_txt.text = "Links: #{status[:link_size]}"
41
- end
42
-
43
- if status.has_key? :page_size
44
- @page_size_txt.text = "Pages: #{status[:page_size]}"
45
- end
46
-
47
- if status.has_key? :total_requests
48
- @requests_txt.text = "Requests: #{status[:total_requests]}"
49
- end
50
- end
51
-
52
- def initialize(owner)
53
- super(owner, :opts => LAYOUT_FILL_X|FRAME_RAISED)
54
- @info_fields = []
55
- #frame = FXHorizontalFrame.new(, :opts => LAYOUT_FILL_Y, :padding => 0)
56
- frame = self
57
- @info_fields << ( @status_txt = FXLabel.new(frame, "Status: Stopped", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
58
- @info_fields << (@link_size_txt = FXLabel.new(frame, "Links: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
59
- @info_fields << (@page_size_txt = FXLabel.new(frame, "Pages: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
60
- @info_fields << (@requests_txt = FXLabel.new(frame, "Requests: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
61
-
62
- @info_fields.each do |i|
63
- i.justify = JUSTIFY_LEFT
64
- end
65
- end
66
-
67
- end
68
- end
69
- end
70
- end
14
+ #puts status.to_yaml
15
+ if status.has_key? :engine_status
16
+ case status[:engine_status]
17
+ when CRAWL_NONE
18
+ self.backColor = self.parent.backColor
19
+ @status_txt.text = "Status: Idle"
20
+ when CRAWL_RUNNING
21
+ self.backColor = FXColor::Red
22
+ @status_txt.text = "Status: Running"
23
+
24
+ when CRAWL_PAUSED
25
+ self.backColor = FXColor::Yellow
26
+ @status_txt.text = "Status: Paused"
27
+ end
28
+ end
29
+
30
+ if status.has_key? :link_size
31
+ @link_size_txt.text = "Links: #{status[:link_size]}"
32
+ end
33
+
34
+ if status.has_key? :page_size
35
+ @page_size_txt.text = "Pages: #{status[:page_size]}"
36
+ end
37
+
38
+ if status.has_key? :total_requests
39
+ @requests_txt.text = "Requests: #{status[:total_requests]}"
40
+ end
41
+ end
42
+
43
+ def initialize(owner)
44
+ super(owner, :opts => LAYOUT_FILL_X|FRAME_RAISED)
45
+ @info_fields = []
46
+ #frame = FXHorizontalFrame.new(, :opts => LAYOUT_FILL_Y, :padding => 0)
47
+ frame = self
48
+ @info_fields << ( @status_txt = FXLabel.new(frame, "Status: Stopped", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
49
+ @info_fields << (@link_size_txt = FXLabel.new(frame, "Links: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
50
+ @info_fields << (@page_size_txt = FXLabel.new(frame, "Pages: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
51
+ @info_fields << (@requests_txt = FXLabel.new(frame, "Requests: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
52
+
53
+ @info_fields.each do |i|
54
+ i.justify = JUSTIFY_LEFT
55
+ end
56
+ end
57
+
58
+ end
59
+ end
60
+ end
61
+ end
71
62
  end
@@ -1,29 +1,20 @@
1
- #.
2
- # bags.rb
3
- #.
4
- # Copyright 2014 by siberas, http://www.siberas.de
5
- # This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
6
- # WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
7
- # WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
8
- # You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
9
-
10
1
  # @private
11
- module Watobo#:nodoc: all
12
- module Crawler
13
- class PageBag
14
- attr :page, :depth
15
- def initialize(page, depth)
16
- @page = page
17
- @depth = depth
18
- end
19
- end
20
-
21
- class LinkBag
22
- attr :link, :depth
23
- def initialize(link, depth)
24
- @link = link
25
- @depth = depth
26
- end
27
- end
28
- end
2
+ module Watobo#:nodoc: all
3
+ module Crawler
4
+ class PageBag
5
+ attr :page, :depth
6
+ def initialize(page, depth)
7
+ @page = page
8
+ @depth = depth
9
+ end
10
+ end
11
+
12
+ class LinkBag
13
+ attr :link, :depth
14
+ def initialize(link, depth)
15
+ @link = link
16
+ @depth = depth
17
+ end
18
+ end
19
+ end
29
20
  end
@@ -1,22 +1,13 @@
1
- #.
2
- # constants.rb
3
- #.
4
- # Copyright 2014 by siberas, http://www.siberas.de
5
- # This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
6
- # WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
7
- # WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
8
- # You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
9
-
10
1
  # @private
11
- module Watobo#:nodoc: all
12
- module Plugin
13
- module Crawler
14
- module Constants
15
- CRAWL_NONE = 0x00
16
- CRAWL_RUNNING = 0x01
17
- CRAWL_PAUSED = 0x02
18
-
19
- end
20
- end
21
- end
2
+ module Watobo#:nodoc: all
3
+ module Plugin
4
+ module Crawler
5
+ module Constants
6
+ CRAWL_NONE = 0x00
7
+ CRAWL_RUNNING = 0x01
8
+ CRAWL_PAUSED = 0x02
9
+
10
+ end
11
+ end
12
+ end
22
13
  end
@@ -1,517 +1,508 @@
1
- #.
2
- # engine.rb
3
- #.
4
- # Copyright 2014 by siberas, http://www.siberas.de
5
- # This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
6
- # WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
7
- # WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
8
- # You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
9
-
10
1
  # @private
11
- module Watobo#:nodoc: all
12
- module Crawler
13
-
14
- class Agent < Mechanize
15
-
16
- def initialize(opts)
17
- super()
18
-
19
-
20
- self.verify_mode = OpenSSL::SSL::VERIFY_NONE
21
- self.ignore_bad_chunking = true
22
- self.keep_alive = false
23
-
24
- self.user_agent = opts[:user_agent] if opts.has_key?(:user_agent)
25
-
26
- if opts.has_key? :username and opts.has_key? :password
27
- unless opts[:username].empty? and opts[:password].empty?
28
-
29
- user = opts[:username]
30
- pw = opts[:password]
31
- uri = opts[:auth_uri]
32
- # puts "Got Credentials for #{uri}: #{user} / #{pw}"
33
- self.add_auth(uri, user , pw )
34
- # TODO: remove this workaround for a Mechanize Bug (#243)
35
- p = self.get uri
36
- end
37
- end
38
-
39
- if ( opts.has_key? :proxy_host ) && ( opts.has_key? :proxy_port )
40
- self.set_proxy( opts[:proxy_host], opts[:proxy_port] )
41
- end
42
-
43
- if opts.has_key? :pre_connect_hook
44
- self.pre_connect_hooks << opts[:pre_connect_hook] if opts[:pre_connect_hook].respond_to? :call
45
- end
46
-
47
- unless opts[:cookie_jar].nil?
48
- clean_jar = Mechanize::CookieJar.new
49
- opts[:cookie_jar].each{ |cookie|
50
- clean_jar.add! cookie
51
- }
52
- self.cookie_jar = clean_jar
53
- end
54
-
55
- end
56
-
57
- end
58
-
59
- class Engine
60
- include Watobo::Plugin::Crawler::Constants
61
-
62
- def subscribe(event, &callback)
63
- (@event_dispatcher_listeners[event] ||= []) << callback
64
- end
65
-
66
- def clearEvents(event)
67
- @event_dispatcher_listeners[event] ||= []
68
- @event_dispatcher_listeners[event].clear
69
- end
70
-
71
- def notify(event, *args)
72
- if @event_dispatcher_listeners[event]
73
- # puts "NOTIFY: #{self}(:#{event}) [#{@event_dispatcher_listeners[event].length}]" if $DEBUG
74
- @event_dispatcher_listeners[event].each do |m|
75
- m.call(*args) if m.respond_to? :call
76
- end
77
- end
78
- end
79
-
80
- def settings
81
- @opts
82
- end
83
-
84
-
85
-
86
- def get_page(url, opts={})
87
- ro = {}.update @opts
88
- ro.update opts
89
- agent = Crawler::Agent.new(ro)
90
- page = nil
91
- page = agent.get url
92
- return agent, page
93
- end
94
-
95
- def initialize(opts={})
96
- @event_dispatcher_listeners = Hash.new
97
- @status_lock = Mutex.new
98
-
99
- @opts = {
100
- :submit_forms => true,
101
- :max_depth => 5,
102
- :max_repeat => 20,
103
- :max_threads => 4,
104
- :user_agent => "watobo-crawler",
105
- :proxy_host => '127.0.0.1',
106
- :proxy_port => Watobo::Conf::Interceptor.port,
107
- :delay => 0,
108
- :head_request_pattern => '(pdf|swf|doc|flv|jpg|png|gif)',
109
- :allowed_hosts => [], # regex's
110
- :allowed_urls => [], # regex's
111
- :excluded_urls => ["logout"], # regex's
112
- :excluded_fields => ["userid","username","password"], # regex's'
113
- :excluded_form_names => [], # regex's'
114
- :root_path => "", # regex
115
- :username => "",
116
- :password => "",
117
- :auth_uri => nil,
118
- :auth_domain => "", # for ntlm auth
119
- :cookie_jar => nil
120
- }
121
-
122
- @opts.update opts
123
- @opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?
124
-
125
- @stats = {
126
- :total_requests => 0
127
- }
128
-
129
- @link_keys = Hash.new
130
- @link_counts = Hash.new
131
-
132
- @form_keys = Hash.new
133
- @form_counts = Hash.new
134
-
135
- end
136
-
137
- def pause
138
- false
139
- end
140
-
141
- def cancel
142
- puts "[CRAWLER] - CANCEL!!"
143
- #@status_lock.synchronize do
144
- # @engine_status = CRAWL_NONE
2
+ module Watobo#:nodoc: all
3
+ module Crawler
4
+
5
+ class Agent < Mechanize
6
+
7
+ def initialize(opts)
8
+ super()
9
+
10
+
11
+ self.verify_mode = OpenSSL::SSL::VERIFY_NONE
12
+ self.ignore_bad_chunking = true
13
+ self.keep_alive = false
14
+
15
+ self.user_agent = opts[:user_agent] if opts.has_key?(:user_agent)
16
+
17
+ if opts.has_key? :username and opts.has_key? :password
18
+ unless opts[:username].empty? and opts[:password].empty?
19
+
20
+ user = opts[:username]
21
+ pw = opts[:password]
22
+ uri = opts[:auth_uri]
23
+ # puts "Got Credentials for #{uri}: #{user} / #{pw}"
24
+ self.add_auth(uri, user , pw )
25
+ # TODO: remove this workaround for a Mechanize Bug (#243)
26
+ p = self.get uri
27
+ end
28
+ end
29
+
30
+ if ( opts.has_key? :proxy_host ) && ( opts.has_key? :proxy_port )
31
+ self.set_proxy( opts[:proxy_host], opts[:proxy_port] )
32
+ end
33
+
34
+ if opts.has_key? :pre_connect_hook
35
+ self.pre_connect_hooks << opts[:pre_connect_hook] if opts[:pre_connect_hook].respond_to? :call
36
+ end
37
+
38
+ unless opts[:cookie_jar].nil?
39
+ clean_jar = Mechanize::CookieJar.new
40
+ opts[:cookie_jar].each{ |cookie|
41
+ clean_jar.add! cookie
42
+ }
43
+ self.cookie_jar = clean_jar
44
+ end
45
+
46
+ end
47
+
48
+ end
49
+
50
+ class Engine
51
+ include Watobo::Plugin::Crawler::Constants
52
+
53
+ def subscribe(event, &callback)
54
+ (@event_dispatcher_listeners[event] ||= []) << callback
55
+ end
56
+
57
+ def clearEvents(event)
58
+ @event_dispatcher_listeners[event] ||= []
59
+ @event_dispatcher_listeners[event].clear
60
+ end
61
+
62
+ def notify(event, *args)
63
+ if @event_dispatcher_listeners[event]
64
+ # puts "NOTIFY: #{self}(:#{event}) [#{@event_dispatcher_listeners[event].length}]" if $DEBUG
65
+ @event_dispatcher_listeners[event].each do |m|
66
+ m.call(*args) if m.respond_to? :call
67
+ end
68
+ end
69
+ end
70
+
71
+ def settings
72
+ @opts
73
+ end
74
+
75
+
76
+
77
+ def get_page(url, opts={})
78
+ ro = {}.update @opts
79
+ ro.update opts
80
+ agent = Crawler::Agent.new(ro)
81
+ page = nil
82
+ page = agent.get url
83
+ return agent, page
84
+ end
85
+
86
# Builds a crawler engine. +opts+ overrides the defaults listed below.
def initialize(opts={})
  @event_dispatcher_listeners = {}
  @status_lock = Mutex.new

  defaults = {
    :submit_forms => true,
    :max_depth => 5,
    :max_repeat => 20,
    :max_threads => 4,
    :user_agent => "watobo-crawler",
    :proxy_host => '127.0.0.1',
    :proxy_port => Watobo::Conf::Interceptor.port,
    :delay => 0,
    :head_request_pattern => '(pdf|swf|doc|flv|jpg|png|gif)',
    :allowed_hosts => [],                                  # regexes
    :allowed_urls => [],                                   # regexes
    :excluded_urls => ["logout"],                          # regexes
    :excluded_fields => ["userid","username","password"],  # regexes
    :excluded_form_names => [],                            # regexes
    :root_path => "",                                      # regex
    :username => "",
    :password => "",
    :auth_uri => nil,
    :auth_domain => "",                                    # for ntlm auth
    :cookie_jar => nil
  }

  @opts = defaults.merge(opts)
  # an explicit nil pattern is normalised to the empty string
  @opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?

  @stats = { :total_requests => 0 }

  # bookkeeping for links and forms that were already handled
  @link_keys = {}
  @link_counts = {}
  @form_keys = {}
  @form_counts = {}
end
127
+
128
# Pausing is not implemented; always returns false.
def pause
  return false
end
131
+
132
# Aborts a crawl: marks the engine as stopped, terminates all grabber
# threads and drops all queued work and bookkeeping.
#
# Safe to call before #run was ever started (queues and thread list are
# only created by #run), and a grabber thread that ends with the injected
# "CANCEL" error no longer aborts the cleanup.
def cancel
  puts "[CRAWLER] - CANCEL!!"
  Watobo::Crawler::Status.engine = CRAWL_NONE

  grabbers = @grabber_threads || []
  grabbers.each do |gt|
    puts "Killing Thread #{gt}"
    gt.kill
    gt.raise "CANCEL"
  end

  grabbers.each do |gt|
    begin
      gt.join
    rescue => bang
      # Thread#join re-raises the exception the thread died with;
      # that must not prevent the cleanup below.
      puts "Grabber thread terminated with: #{bang}" if $DEBUG
    end
  end

  [ @link_queue, @page_queue, @grabber_threads,
    @link_keys, @link_counts,
    @form_keys, @form_counts ].each do |container|
    container.clear unless container.nil?
  end

  puts "CANCELED - CANCELED"
end
158
+
168
159
# Crawls the site starting at +url+. Blocks until the crawl is finished.
#
# +opts+ is merged into the engine options before the crawl starts.
# Returns false (and resets the engine status to CRAWL_NONE) when +url+
# has no host part; previously the status was left at CRAWL_RUNNING in
# that case although nothing was running.
def run(url, opts={})
  Watobo::Crawler::Status.reset
  Watobo::Crawler::Status.engine = CRAWL_RUNNING

  @opts.update opts
  @opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?

  puts "crawler settings:"
  puts @opts.to_json

  @link_queue = Queue.new
  @page_queue = Queue.new

  @link_keys = Hash.new
  @link_counts = Hash.new

  @form_keys = Hash.new
  @form_counts = Hash.new

  @skipped_sites = Hash.new

  @grabber_threads = []
  start_link = URI.parse url
  if start_link.host.nil?
    # nothing was started, so do not report a running engine
    Watobo::Crawler::Status.engine = CRAWL_NONE
    return false
  end

  allow_host(start_link)

  @link_queue.enq LinkBag.new(start_link, 0)

  notify(:log, "Crawling #{url} started ..." )

  @opts[:max_threads].times do
    g = Grabber.new(@link_queue, @page_queue, @opts )
    @grabber_threads << g.run
  end

  puts "* startet #{@grabber_threads.length} grabbers"

  loop do
    pagebag = @page_queue.deq

    process_links(pagebag)
    process_forms(pagebag)

    Watobo::Crawler::Status.inc_requests() unless pagebag.nil?
    Watobo::Crawler::Status.page_size= @page_queue.size
    Watobo::Crawler::Status.link_size= @link_queue.size

    puts "Links/Pages: #{@link_queue.size}/#{@page_queue.size}"

    next unless @page_queue.empty?

    # page_queue is empty: wait until every grabber is idle, i.e. blocked
    # waiting on the link_queue
    until @link_queue.num_waiting == @grabber_threads.length
      Thread.pass
    end

    # the grabbers may have produced new pages meanwhile; the crawl is
    # only finished if the page_queue is still empty
    if @page_queue.empty?
      @grabber_threads.each { |t| t.kill }
      puts "Finished Crawling"
      Watobo::Crawler::Status.engine = CRAWL_NONE

      notify(:log, "Crawling finished")
      break
    end
  end
end
236
+
237
+ private
238
+
239
# Snapshot of the engine state: engine status, current queue sizes and
# all counters from @stats, as one hash.
def current_status
  snapshot = {
    :engine_status => @engine_status,
    :link_size => @link_queue.size,
    :page_size => @page_queue.size
  }
  snapshot.update(@stats)
end
247
+
248
+
249
# Adds the site of +uri+ to the list of allowed hosts.
#
# The entries of allowed_hosts are interpolated into regular expressions
# by host_allowed?, so the site is stored regex-escaped (otherwise the
# dots of the host name would match any character). A site that is
# already listed is not added again.
#
# Returns the allowed-hosts list, or nil if +uri+ is not a URI.
def allow_host(uri)
  return unless uri.is_a? URI

  site_pattern = Regexp.escape(uri.site.to_s)
  hosts = allowed_hosts
  hosts << site_pattern unless hosts.include? site_pattern
  hosts
end
257
+
258
# Submits every form of the page in +pagebag+ that passes send_form?.
#
# Relative form actions are resolved against the page URI; a missing
# action resolves to the page URI itself. Absolute actions are left
# untouched (they used to be overwritten with an empty string).
# An error in one form is logged and does not stop the remaining forms.
#
# Returns false if +pagebag+ carries no page with forms.
def process_forms(pagebag)
  return false unless pagebag.respond_to? :page
  page = pagebag.page
  return false unless page.respond_to? :forms

  page.forms.each do |form|
    begin
      unless form.action =~ /^http/
        form.action = page.uri.merge(form.action.to_s).to_s
      end

      send_form(form, pagebag.depth) if send_form? form
    rescue => bang
      puts bang
      puts bang.backtrace if $DEBUG
    end
  end
end
273
+
274
# Queues every link of the page in +pagebag+ that passes follow_link?.
# Links without href are skipped; relative links are resolved against
# the page URI. Errors of a single link are printed and ignored.
#
# Returns false if +pagebag+ carries no page with links.
def process_links(pagebag)
  return false unless pagebag.respond_to? :page
  page = pagebag.page
  return false unless page.respond_to? :links

  page.links.each do |anchor|
    begin
      next if anchor.href.nil?

      target = if anchor.href =~ /^http/
                 anchor
               else
                 page.uri.merge anchor.uri
               end

      submit_link(target, pagebag.depth) if follow_link? target
    rescue => bang
      puts bang
      puts bang.backtrace if $DEBUG
    end
  end
end
299
+
300
+
301
# Puts +link+ on the link queue unless the very same link was submitted
# before (returns false then) or links of the same shape (same URL with
# parameter values blanked) were already seen :max_repeat times.
def submit_link(link, depth)
  shape_key = link_key(link, :clear_values => true)
  @link_counts[shape_key] = (@link_counts[shape_key] || 0) + 1

  exact_key = link_key(link)
  return false if @link_keys.has_key? exact_key
  @link_keys[exact_key] = nil

  unless @link_counts[shape_key] < @opts[:max_repeat]
    puts "! MAX REPEAT !\nSkipped link #{link}" if $DEBUG
    return
  end

  @link_queue.enq LinkBag.new(link, depth)
end
316
+
317
# MD5 fingerprint of a form built from action, method and the sorted
# request parameters. With :clear_values => true the parameter values
# are blanked, so forms differing only in values get the same key.
def form_key(form, opts={} )
  cfg = { :clear_values => false }.merge(opts)

  fingerprint = "#{form.action}"
  fingerprint << form.method

  if form.request_data =~ /=/
    params = form.request_data.split("&").sort.join("&")
    params = params.gsub(/=[^&]*/,'=') if cfg[:clear_values]
    fingerprint << params
  end

  Digest::MD5.hexdigest fingerprint
end
334
+
335
# Submits +form+ and queues the resulting page with depth+1.
# Returns false if the engine status is CRAWL_NONE or the identical form
# was sent before. Forms of the same shape are only sent while their
# count is below :max_repeat. Errors during submission are printed.
def send_form(form, depth)
  return false if @engine_status == CRAWL_NONE

  shape_key = form_key(form, :clear_values => true)
  @form_counts[shape_key] = (@form_counts[shape_key] || 0) + 1

  exact_key = form_key(form)
  return false if @form_keys.has_key? exact_key
  @form_keys[exact_key] = nil

  begin
    if @form_counts[shape_key] >= @opts[:max_repeat]
      puts "! MAX REPEAT !\nSkipped Form #{form.action}"
    else
      # prefer clicking a button when the form has one
      response = if form.buttons.length > 0
                   form.click_button
                 else
                   form.submit()
                 end
      puts response.class
      @page_queue.enq PageBag.new(response, depth+1)
    end
  rescue => bang
    puts bang
    puts bang.backtrace
  end
end
362
+
363
# True if +form+ should be submitted: the engine is running, form
# submission is enabled, the action URL and all fields are allowed and
# the form has not been sent yet.
def send_form?(form)
  sendable = engine_running? &&
             @opts[:submit_forms] == true &&
             allowed?(form.action) &&
             fields_allowed?(form) &&
             !form_sent?(form)
  sendable ? true : false
end
376
+
377
# True if +link+ is within the allowed scope and was not followed before.
def follow_link?(link)
  (allowed?(link) && !link_is_followed?(link)) ? true : false
end
382
+
383
# True if the site of +uri+ fully matches one of the allowed-host
# patterns. With an empty pattern list no host is allowed.
def host_allowed?(uri)
  patterns = allowed_hosts
  return false if patterns.empty?

  patterns.any? { |pattern| uri.site =~ /^#{pattern}$/ }
end
400
+
401
# True if the path of +uri+ matches no excluded-URL pattern and either
# no allowed-URL patterns are configured or one of them matches.
def url_allowed?(uri)
  return false if excluded_urls.any? { |pattern| uri.path =~ /#{pattern}/ }

  whitelist = allowed_urls
  whitelist.empty? || whitelist.any? { |pattern| uri.path =~ /#{pattern}/ }
end
412
+
413
# True if no root path is configured or the path of +uri+ starts with
# the configured root path (which is treated as a regex).
def path_allowed?(uri)
  base = root_path
  return true if base.nil? || base.empty?

  if uri.path =~ /^#{base}/
    true
  else
    false
  end
end
420
+
421
# Normalises the different link representations to a URI:
# HTTP(S) URIs are returned as they are, strings are parsed, anything
# else is asked for its #uri. Returns nil for unsupported objects.
def cleanup_uri(obj)
  return obj if obj.is_a? URI::HTTP
  return URI.parse(obj) if obj.is_a? String
  return obj.uri if obj.respond_to? :uri

  nil
end
428
+
429
# True if +link+ is within the crawl scope, i.e. host, URL and path
# checks all pass.
#
# +link+ may be a Mechanize link (anything responding to #uri), a String
# or an HTTP URI; cleanup_uri does the conversion. A link that cannot be
# converted - including a malformed URL string - is out of scope instead
# of raising URI::InvalidURIError into the crawl loop.
def allowed?(link)
  uri = begin
    cleanup_uri(link)
  rescue URI::InvalidURIError
    nil
  end
  return false if uri.nil?

  host_allowed?(uri) &&
    url_allowed?(uri) &&
    path_allowed?(uri)
end
443
+
444
# True if a form with the same key (including values) was already sent.
def form_sent?(form)
  key = form_key(form)
  @form_keys.has_key?(key)
end
448
+
449
# MD5 fingerprint of a link built from scheme, site, path and the sorted
# query parameters. With :clear_values => true everything between '='
# and the next '&' is blanked, so links differing only in parameter
# values get the same key.
def link_key(link, opts={})
  cfg = { :clear_values => false }.merge(opts)

  uri = cleanup_uri(link)
  sorted_query = uri.query.nil? ? "" : uri.query.split("&").sort.join("&")

  key = ""
  [ uri.scheme, uri.site, uri.path, sorted_query ].each { |part| key << part }
  key.gsub!(/=[^&]*/,'=') if cfg[:clear_values] == true

  Digest::MD5.hexdigest key
end
467
+
468
# True unless @engine_status equals CRAWL_NONE; the status is read under
# the status lock.
def engine_running?
  @status_lock.synchronize { @engine_status != CRAWL_NONE }
end
474
+
475
# True if a link with the same key was already submitted.
def link_is_followed?(link)
  @link_keys.has_key?(link_key(link)) ? true : false
end
481
+
482
# True if no field name of +form+ matches one of the excluded-field
# patterns.
def fields_allowed?(form)
  form.fields.none? do |field|
    excluded_fields.any? { |pattern| field.name =~ /#{pattern}/ }
  end
end
490
+
491
# Exposes the crawler options as accessors: +engine.max_depth+ reads
# @opts[:max_depth], +engine.max_depth = 3+ writes it.
#
# Names that are not option keys are passed on to super and raise
# NoMethodError. (The original `@opts.has_key? k || super` was parsed as
# `has_key?(k || super)`, so super was never reached: unknown getters
# silently returned nil and unknown setters created new option keys.)
def method_missing(name, *args, &block)
  method_name = name.to_s

  if method_name.end_with?('=')
    key = method_name.chomp('=').to_sym
    return super unless @opts.is_a?(Hash) && @opts.has_key?(key)
    @opts[key] = args[0]
  else
    key = name.to_sym
    return super unless @opts.is_a?(Hash) && @opts.has_key?(key)
    @opts[key]
  end
end

# Keeps respond_to? in line with method_missing: option names (and their
# setters) are reported as supported.
def respond_to_missing?(name, include_private = false)
  key = name.to_s.chomp('=').to_sym
  (@opts.is_a?(Hash) && @opts.has_key?(key)) || super
end
505
+ end
506
+ end
507
+
508
+ end