watobo 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (283) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +46 -1
  3. data/bin/nfq_server.rb +0 -9
  4. data/bin/watobo_gui.rb +3 -13
  5. data/custom-views/prettify-json.rb +9 -18
  6. data/icons/watobo.ico +0 -0
  7. data/icons/watobo.ico.old +0 -0
  8. data/lib/watobo.rb +10 -19
  9. data/lib/watobo/adapters.rb +5 -14
  10. data/lib/watobo/adapters/data_store.rb +50 -59
  11. data/lib/watobo/adapters/file/file_store.rb +287 -296
  12. data/lib/watobo/adapters/file/marshal_store.rb +293 -296
  13. data/lib/watobo/adapters/session_store.rb +5 -14
  14. data/lib/watobo/ca.rb +1 -10
  15. data/lib/watobo/config.rb +197 -206
  16. data/lib/watobo/constants.rb +0 -9
  17. data/lib/watobo/core.rb +3 -12
  18. data/lib/watobo/core/active_check.rb +72 -135
  19. data/lib/watobo/core/active_checks.rb +49 -58
  20. data/lib/watobo/core/ca.rb +369 -389
  21. data/lib/watobo/core/cert_store.rb +34 -43
  22. data/lib/watobo/core/chat.rb +92 -101
  23. data/lib/watobo/core/chats.rb +271 -280
  24. data/lib/watobo/core/client_cert_store.rb +106 -35
  25. data/lib/watobo/core/conversation.rb +48 -57
  26. data/lib/watobo/core/cookie.rb +23 -32
  27. data/lib/watobo/core/egress_handlers.rb +98 -0
  28. data/lib/watobo/core/finding.rb +66 -75
  29. data/lib/watobo/core/findings.rb +107 -114
  30. data/lib/watobo/core/forwarding_proxy.rb +13 -22
  31. data/lib/watobo/core/fuzz_gen.rb +0 -9
  32. data/lib/watobo/core/intercept_carver.rb +166 -177
  33. data/lib/watobo/core/intercept_filter.rb +235 -244
  34. data/lib/watobo/core/interceptor.rb +98 -107
  35. data/lib/watobo/core/min_class.rb +4 -13
  36. data/lib/watobo/core/netfilter_queue.rb +170 -179
  37. data/lib/watobo/core/ott_cache.rb +132 -141
  38. data/lib/watobo/core/parameter.rb +43 -52
  39. data/lib/watobo/core/passive_check.rb +103 -102
  40. data/lib/watobo/core/passive_checks.rb +48 -57
  41. data/lib/watobo/core/passive_scanner.rb +54 -55
  42. data/lib/watobo/core/plugin.rb +11 -20
  43. data/lib/watobo/core/project.rb +3 -9
  44. data/lib/watobo/core/proxy.rb +43 -52
  45. data/lib/watobo/core/request.rb +125 -123
  46. data/lib/watobo/core/response.rb +44 -53
  47. data/lib/watobo/core/scanner.rb +0 -9
  48. data/lib/watobo/core/scanner3.rb +405 -414
  49. data/lib/watobo/core/scope.rb +83 -92
  50. data/lib/watobo/core/session.rb +1043 -1026
  51. data/lib/watobo/core/sid_cache.rb +98 -107
  52. data/lib/watobo/core/subscriber.rb +25 -34
  53. data/lib/watobo/defaults.rb +21 -30
  54. data/lib/watobo/external/diff/lcs.rb +0 -9
  55. data/lib/watobo/external/diff/lcs/array.rb +0 -9
  56. data/lib/watobo/external/diff/lcs/block.rb +0 -9
  57. data/lib/watobo/external/diff/lcs/callbacks.rb +0 -9
  58. data/lib/watobo/external/diff/lcs/change.rb +0 -9
  59. data/lib/watobo/external/diff/lcs/hunk.rb +0 -9
  60. data/lib/watobo/external/diff/lcs/ldiff.rb +0 -9
  61. data/lib/watobo/external/diff/lcs/string.rb +0 -9
  62. data/lib/watobo/externals.rb +6 -15
  63. data/lib/watobo/framework.rb +4 -13
  64. data/lib/watobo/framework/create_project.rb +60 -69
  65. data/lib/watobo/framework/init.rb +0 -9
  66. data/lib/watobo/framework/init_modules.rb +0 -9
  67. data/lib/watobo/framework/license_text.rb +28 -37
  68. data/lib/watobo/framework/load_chat.rb +13 -22
  69. data/lib/watobo/gui.rb +132 -123
  70. data/lib/watobo/gui/about_watobo.rb +0 -9
  71. data/lib/watobo/gui/browser_preview.rb +0 -9
  72. data/lib/watobo/gui/certificate_dialog.rb +0 -9
  73. data/lib/watobo/gui/chat_diff.rb +0 -9
  74. data/lib/watobo/gui/chatviewer_frame.rb +73 -72
  75. data/lib/watobo/gui/checkboxtree.rb +0 -9
  76. data/lib/watobo/gui/checks_policy_frame.rb +0 -9
  77. data/lib/watobo/gui/client_cert_dialog.rb +96 -87
  78. data/lib/watobo/gui/confirm_scan_dialog.rb +0 -9
  79. data/lib/watobo/gui/conversation_table.rb +158 -164
  80. data/lib/watobo/gui/conversation_table_ctrl.rb +207 -216
  81. data/lib/watobo/gui/conversation_table_ctrl2.rb +373 -382
  82. data/lib/watobo/gui/csrf_token_dialog.rb +0 -9
  83. data/lib/watobo/gui/custom_viewer.rb +374 -383
  84. data/lib/watobo/gui/dashboard.rb +296 -303
  85. data/lib/watobo/gui/define_scope_frame.rb +0 -9
  86. data/lib/watobo/gui/differ_frame.rb +215 -224
  87. data/lib/watobo/gui/edit_comment.rb +0 -9
  88. data/lib/watobo/gui/edit_scope_dialog.rb +0 -9
  89. data/lib/watobo/gui/export_dialog.rb +104 -113
  90. data/lib/watobo/gui/finding_info.rb +0 -9
  91. data/lib/watobo/gui/findings_tree.rb +210 -217
  92. data/lib/watobo/gui/full_scan_dialog.rb +0 -9
  93. data/lib/watobo/gui/fuzzer_gui.rb +1295 -1313
  94. data/lib/watobo/gui/fxsave_thread.rb +14 -0
  95. data/lib/watobo/gui/goto_url_dialog.rb +70 -79
  96. data/lib/watobo/gui/hex_viewer.rb +0 -9
  97. data/lib/watobo/gui/html_viewer.rb +287 -296
  98. data/lib/watobo/gui/intercept_filter_dialog.rb +188 -197
  99. data/lib/watobo/gui/interceptor_gui.rb +1041 -1051
  100. data/lib/watobo/gui/interceptor_settings_dialog.rb +0 -9
  101. data/lib/watobo/gui/json_viewer.rb +287 -0
  102. data/lib/watobo/gui/list_box.rb +101 -110
  103. data/lib/watobo/gui/log_file_viewer.rb +32 -41
  104. data/lib/watobo/gui/log_viewer.rb +83 -88
  105. data/lib/watobo/gui/login_wizzard.rb +0 -9
  106. data/lib/watobo/gui/main_window.rb +587 -618
  107. data/lib/watobo/gui/manual_request_editor.rb +620 -565
  108. data/lib/watobo/gui/master_pw_dialog.rb +0 -9
  109. data/lib/watobo/gui/mixins/gui_settings.rb +29 -38
  110. data/lib/watobo/gui/page_tree.rb +217 -226
  111. data/lib/watobo/gui/password_policy_dialog.rb +0 -9
  112. data/lib/watobo/gui/plugin_board.rb +0 -9
  113. data/lib/watobo/gui/preferences_dialog.rb +0 -9
  114. data/lib/watobo/gui/progress_window.rb +17 -27
  115. data/lib/watobo/gui/project_wizzard.rb +0 -9
  116. data/lib/watobo/gui/proxy_dialog.rb +1 -10
  117. data/lib/watobo/gui/quick_scan_dialog.rb +0 -9
  118. data/lib/watobo/gui/request_builder_frame.rb +102 -111
  119. data/lib/watobo/gui/request_editor.rb +181 -137
  120. data/lib/watobo/gui/rewrite_filters_dialog.rb +394 -403
  121. data/lib/watobo/gui/rewrite_rules_dialog.rb +372 -381
  122. data/lib/watobo/gui/save_chat_dialog.rb +140 -149
  123. data/lib/watobo/gui/scanner_settings_dialog.rb +0 -9
  124. data/lib/watobo/gui/select_chat_dialog.rb +0 -9
  125. data/lib/watobo/gui/session_management_dialog.rb +0 -9
  126. data/lib/watobo/gui/sites_tree.rb +0 -9
  127. data/lib/watobo/gui/status_bar.rb +0 -9
  128. data/lib/watobo/gui/table_editor.rb +0 -9
  129. data/lib/watobo/gui/tagless_viewer.rb +0 -9
  130. data/lib/watobo/gui/templates/plugin.rb +0 -9
  131. data/lib/watobo/gui/templates/plugin2.rb +92 -100
  132. data/lib/watobo/gui/templates/plugin_base.rb +144 -153
  133. data/lib/watobo/gui/text_viewer.rb +0 -9
  134. data/lib/watobo/gui/transcoder_window.rb +0 -9
  135. data/lib/watobo/gui/utils/gui_utils.rb +0 -9
  136. data/lib/watobo/gui/utils/init_icons.rb +86 -95
  137. data/lib/watobo/gui/utils/load_icons.rb +33 -42
  138. data/lib/watobo/gui/utils/load_plugins.rb +116 -119
  139. data/lib/watobo/gui/utils/master_password.rb +68 -77
  140. data/lib/watobo/gui/utils/save_default_settings.rb +113 -122
  141. data/lib/watobo/gui/utils/save_project_settings.rb +0 -9
  142. data/lib/watobo/gui/utils/save_proxy_settings.rb +41 -50
  143. data/lib/watobo/gui/utils/save_scanner_settings.rb +18 -27
  144. data/lib/watobo/gui/utils/session_history.rb +112 -121
  145. data/lib/watobo/gui/workspace_dialog.rb +0 -9
  146. data/lib/watobo/gui/www_auth_dialog.rb +0 -9
  147. data/lib/watobo/gui/xml_viewer_frame.rb +0 -9
  148. data/lib/watobo/http.rb +4 -13
  149. data/lib/watobo/http/cookies/cookies.rb +26 -35
  150. data/lib/watobo/http/data/data.rb +45 -54
  151. data/lib/watobo/http/data/json.rb +47 -55
  152. data/lib/watobo/http/url/url.rb +38 -47
  153. data/lib/watobo/http/xml/xml.rb +124 -130
  154. data/lib/watobo/interceptor.rb +3 -12
  155. data/lib/watobo/interceptor/proxy.rb +742 -739
  156. data/lib/watobo/interceptor/transparent.rb +22 -24
  157. data/lib/watobo/mixins.rb +10 -19
  158. data/lib/watobo/mixins/check_info.rb +27 -36
  159. data/lib/watobo/mixins/httpparser.rb +613 -637
  160. data/lib/watobo/mixins/request_parser.rb +88 -97
  161. data/lib/watobo/mixins/shapers.rb +515 -529
  162. data/lib/watobo/mixins/transcoders.rb +3 -11
  163. data/lib/watobo/parser.rb +1 -10
  164. data/lib/watobo/parser/html.rb +83 -92
  165. data/lib/watobo/patch_fxruby_setfocus.rb +26 -0
  166. data/lib/watobo/sockets.rb +3 -12
  167. data/lib/watobo/sockets/agent.rb +828 -837
  168. data/lib/watobo/sockets/client_socket.rb +308 -312
  169. data/lib/watobo/sockets/connection.rb +401 -410
  170. data/lib/watobo/sockets/http_socket.rb +11 -13
  171. data/lib/watobo/sockets/ntlm_auth.rb +129 -138
  172. data/lib/watobo/utils.rb +10 -19
  173. data/lib/watobo/utils/check_regex.rb +0 -9
  174. data/lib/watobo/utils/copy_object.rb +0 -9
  175. data/lib/watobo/utils/crypto.rb +0 -9
  176. data/lib/watobo/utils/expand_range.rb +23 -32
  177. data/lib/watobo/utils/export_xml.rb +97 -106
  178. data/lib/watobo/utils/file_management.rb +9 -11
  179. data/lib/watobo/utils/hexprint.rb +9 -18
  180. data/lib/watobo/utils/load_chat.rb +0 -9
  181. data/lib/watobo/utils/load_icon.rb +0 -9
  182. data/lib/watobo/utils/ntlm.rb +866 -875
  183. data/lib/watobo/utils/print_debug.rb +12 -21
  184. data/lib/watobo/utils/response_builder.rb +90 -99
  185. data/lib/watobo/utils/response_hash.rb +0 -9
  186. data/lib/watobo/utils/secure_eval.rb +0 -9
  187. data/lib/watobo/utils/strings.rb +10 -19
  188. data/lib/watobo/utils/text2request.rb +0 -9
  189. data/lib/watobo/utils/url.rb +23 -32
  190. data/lib/watobo/utils/utf16.rb +11 -20
  191. data/modules/active/Apache/mod_status.rb +0 -9
  192. data/modules/active/Apache/multiview.rb +151 -160
  193. data/modules/active/Flash/crossdomain.rb +0 -9
  194. data/modules/active/JWT/jwt_oauth2_none.rb +111 -0
  195. data/modules/active/cq5/cq5_default_selectors.rb +106 -115
  196. data/modules/active/cq5/cqp_user_enumeration.rb +125 -134
  197. data/modules/active/directories/dirwalker.rb +0 -9
  198. data/modules/active/discovery/fileextensions.rb +0 -9
  199. data/modules/active/discovery/http_methods.rb +0 -9
  200. data/modules/active/discovery/jsmapfiles.rb +79 -0
  201. data/modules/active/domino/domino_db.rb +68 -76
  202. data/modules/active/dotNET/custom_errors.rb +102 -111
  203. data/modules/active/dotNET/dotnet_files.rb +90 -99
  204. data/modules/active/fileinclusion/lfi_simple.rb +0 -9
  205. data/modules/active/jboss/jboss_basic.rb +0 -9
  206. data/modules/active/sap/business_objects.rb +51 -60
  207. data/modules/active/sap/its_commands.rb +0 -9
  208. data/modules/active/sap/its_service_parameter.rb +0 -9
  209. data/modules/active/sap/its_services.rb +0 -9
  210. data/modules/active/sap/its_xss.rb +0 -9
  211. data/modules/active/shell_shock/shell_shock.rb +139 -148
  212. data/modules/active/siebel/siebel_apps.rb +160 -169
  213. data/modules/active/sqlinjection/sql_boolean.rb +0 -9
  214. data/modules/active/sqlinjection/sql_numerical.rb +198 -0
  215. data/modules/active/sqlinjection/sqli_error.rb +0 -9
  216. data/modules/active/sqlinjection/sqli_timing.rb +220 -229
  217. data/modules/active/struts2/default_handler_ognl.rb +106 -115
  218. data/modules/active/struts2/include_params_ognl.rb +105 -114
  219. data/modules/active/xml/xml_xxe.rb +112 -123
  220. data/modules/active/xss/xss_ng.rb +214 -223
  221. data/modules/active/xss/xss_simple.rb +0 -9
  222. data/modules/passive/ajax.rb +68 -77
  223. data/modules/passive/autocomplete.rb +56 -65
  224. data/modules/passive/cookie_options.rb +0 -9
  225. data/modules/passive/cookie_xss.rb +0 -9
  226. data/modules/passive/detect_code.rb +0 -9
  227. data/modules/passive/detect_fileupload.rb +0 -9
  228. data/modules/passive/detect_infrastructure.rb +0 -9
  229. data/modules/passive/detect_one_time_tokens.rb +0 -9
  230. data/modules/passive/dirindexing.rb +0 -9
  231. data/modules/passive/disclosure_domino.rb +55 -64
  232. data/modules/passive/disclosure_emails.rb +0 -9
  233. data/modules/passive/disclosure_ipaddr.rb +55 -53
  234. data/modules/passive/filename_as_parameter.rb +0 -9
  235. data/modules/passive/form_spotter.rb +0 -9
  236. data/modules/passive/hidden_fields.rb +50 -59
  237. data/modules/passive/hotspots.rb +0 -9
  238. data/modules/passive/in_script_parameter.rb +0 -9
  239. data/modules/passive/json_web_token.rb +93 -0
  240. data/modules/passive/multiple_server_headers.rb +0 -9
  241. data/modules/passive/possible_login.rb +0 -9
  242. data/modules/passive/redirect_url.rb +0 -9
  243. data/modules/passive/redirectionz.rb +0 -9
  244. data/modules/passive/sap-headers.rb +56 -65
  245. data/modules/passive/xss_dom.rb +0 -9
  246. data/plugins/aem/aem.rb +11 -20
  247. data/plugins/aem/gui/main.rb +118 -127
  248. data/plugins/aem/gui/tree_view.rb +171 -180
  249. data/plugins/aem/lib/agent.rb +130 -138
  250. data/plugins/aem/lib/dispatcher.rb +45 -51
  251. data/plugins/aem/lib/engine.rb +177 -186
  252. data/plugins/catalog/catalog.rb +345 -355
  253. data/plugins/crawler/crawler.rb +4 -13
  254. data/plugins/crawler/gui.rb +5 -14
  255. data/plugins/crawler/gui/auth_frame.rb +270 -279
  256. data/plugins/crawler/gui/crawler_gui.rb +271 -276
  257. data/plugins/crawler/gui/general_settings_frame.rb +96 -105
  258. data/plugins/crawler/gui/hooks_frame.rb +80 -89
  259. data/plugins/crawler/gui/scope_frame.rb +50 -59
  260. data/plugins/crawler/gui/settings_tabbook.rb +38 -47
  261. data/plugins/crawler/gui/status_frame.rb +59 -68
  262. data/plugins/crawler/lib/bags.rb +18 -27
  263. data/plugins/crawler/lib/constants.rb +11 -20
  264. data/plugins/crawler/lib/engine.rb +488 -497
  265. data/plugins/crawler/lib/grabber.rb +68 -77
  266. data/plugins/crawler/lib/status.rb +71 -80
  267. data/plugins/crawler/lib/uri_mp.rb +12 -21
  268. data/plugins/filefinder/filefinder.rb +326 -333
  269. data/plugins/sqlmap/bin/test.rb +78 -87
  270. data/plugins/sqlmap/gui.rb +4 -13
  271. data/plugins/sqlmap/gui/main.rb +218 -227
  272. data/plugins/sqlmap/gui/options_frame.rb +97 -106
  273. data/plugins/sqlmap/lib/sqlmap_ctrl.rb +90 -100
  274. data/plugins/sqlmap/sqlmap.rb +2 -11
  275. data/plugins/sslchecker/cli/sslchecker_cli.rb +0 -9
  276. data/plugins/sslchecker/gui/cipher_table.rb +246 -254
  277. data/plugins/sslchecker/gui/gui.rb +258 -264
  278. data/plugins/sslchecker/gui/sslchecker.rb +4 -13
  279. data/plugins/sslchecker/lib/check.rb +127 -133
  280. data/plugins/wshell/gui/main.rb +119 -117
  281. data/plugins/wshell/lib/core.rb +38 -88
  282. data/plugins/wshell/wshell.rb +11 -20
  283. metadata +170 -164
@@ -1,49 +1,40 @@
1
- #.
2
- # settings_tabbook.rb
3
- #.
4
- # Copyright 2014 by siberas, http://www.siberas.de
5
- # This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
6
- # WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
7
- # WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
8
- # You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
9
-
10
1
  # @private
11
- module Watobo#:nodoc: all
12
- module Plugin
13
- module Crawler
14
- class Gui
15
- class SettingsTabBook < FXTabBook
16
- attr :hooks, :general, :log_viewer, :auth, :scope
17
-
18
-
19
-
20
- def initialize(owner)
21
- #@tab = FXTabBook.new(self, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
22
- super(owner, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
23
- FXTabItem.new(self, "General", nil)
24
- # frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_RAISED)
25
- @general = GeneralSettingsFrame.new(self)
26
-
27
- FXTabItem.new(self, "Scope", nil)
28
- @scope = ScopeFrame.new(self)
29
-
30
- FXTabItem.new(self, "Auth", nil)
31
- @auth = AuthFrame.new(self)
32
-
33
-
34
- FXTabItem.new(self, "Hooks", nil)
35
- @hooks = HooksFrame.new(self)
36
-
37
- FXTabItem.new(self, "Log", nil)
38
- frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_THICK|FRAME_RAISED)
39
- @log_viewer = Watobo::Gui::LogViewer.new(frame, :append, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_SUNKEN)
40
-
41
- self.connect(SEL_COMMAND){
42
- @hooks.selected if self.current == 3
43
- }
44
- end
45
- end
46
- end
47
- end
48
- end
2
+ module Watobo#:nodoc: all
3
+ module Plugin
4
+ module Crawler
5
+ class Gui
6
+ class SettingsTabBook < FXTabBook
7
+ attr :hooks, :general, :log_viewer, :auth, :scope
8
+
9
+
10
+
11
+ def initialize(owner)
12
+ #@tab = FXTabBook.new(self, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
13
+ super(owner, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
14
+ FXTabItem.new(self, "General", nil)
15
+ # frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_RAISED)
16
+ @general = GeneralSettingsFrame.new(self)
17
+
18
+ FXTabItem.new(self, "Scope", nil)
19
+ @scope = ScopeFrame.new(self)
20
+
21
+ FXTabItem.new(self, "Auth", nil)
22
+ @auth = AuthFrame.new(self)
23
+
24
+
25
+ FXTabItem.new(self, "Hooks", nil)
26
+ @hooks = HooksFrame.new(self)
27
+
28
+ FXTabItem.new(self, "Log", nil)
29
+ frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_THICK|FRAME_RAISED)
30
+ @log_viewer = Watobo::Gui::LogViewer.new(frame, :append, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_SUNKEN)
31
+
32
+ self.connect(SEL_COMMAND){
33
+ @hooks.selected if self.current == 3
34
+ }
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
49
40
  end
@@ -1,71 +1,62 @@
1
- #.
2
- # status_frame.rb
3
- #.
4
- # Copyright 2014 by siberas, http://www.siberas.de
5
- # This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
6
- # WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
7
- # WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
8
- # You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
9
-
10
1
  # @private
11
- module Watobo#:nodoc: all
12
- module Plugin
13
- module Crawler
14
- class Gui
15
- class StatusFrame < FXHorizontalFrame
16
-
17
- include Watobo::Plugin::Crawler::Constants
18
- # :engine_status => CRAWL_NONE,
19
- # :page_size => 0,
20
- # :link_size => 0,
21
- # :skipped_domains => 0
2
+ module Watobo#:nodoc: all
3
+ module Plugin
4
+ module Crawler
5
+ class Gui
6
+ class StatusFrame < FXHorizontalFrame
7
+
8
+ include Watobo::Plugin::Crawler::Constants
9
+ # :engine_status => CRAWL_NONE,
10
+ # :page_size => 0,
11
+ # :link_size => 0,
12
+ # :skipped_domains => 0
22
13
  def update_status(status)
23
- #puts status.to_yaml
24
- if status.has_key? :engine_status
25
- case status[:engine_status]
26
- when CRAWL_NONE
27
- self.backColor = self.parent.backColor
28
- @status_txt.text = "Status: Idle"
29
- when CRAWL_RUNNING
30
- self.backColor = FXColor::Red
31
- @status_txt.text = "Status: Running"
32
-
33
- when CRAWL_PAUSED
34
- self.backColor = FXColor::Yellow
35
- @status_txt.text = "Status: Paused"
36
- end
37
- end
38
-
39
- if status.has_key? :link_size
40
- @link_size_txt.text = "Links: #{status[:link_size]}"
41
- end
42
-
43
- if status.has_key? :page_size
44
- @page_size_txt.text = "Pages: #{status[:page_size]}"
45
- end
46
-
47
- if status.has_key? :total_requests
48
- @requests_txt.text = "Requests: #{status[:total_requests]}"
49
- end
50
- end
51
-
52
- def initialize(owner)
53
- super(owner, :opts => LAYOUT_FILL_X|FRAME_RAISED)
54
- @info_fields = []
55
- #frame = FXHorizontalFrame.new(, :opts => LAYOUT_FILL_Y, :padding => 0)
56
- frame = self
57
- @info_fields << ( @status_txt = FXLabel.new(frame, "Status: Stopped", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
58
- @info_fields << (@link_size_txt = FXLabel.new(frame, "Links: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
59
- @info_fields << (@page_size_txt = FXLabel.new(frame, "Pages: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
60
- @info_fields << (@requests_txt = FXLabel.new(frame, "Requests: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
61
-
62
- @info_fields.each do |i|
63
- i.justify = JUSTIFY_LEFT
64
- end
65
- end
66
-
67
- end
68
- end
69
- end
70
- end
14
+ #puts status.to_yaml
15
+ if status.has_key? :engine_status
16
+ case status[:engine_status]
17
+ when CRAWL_NONE
18
+ self.backColor = self.parent.backColor
19
+ @status_txt.text = "Status: Idle"
20
+ when CRAWL_RUNNING
21
+ self.backColor = FXColor::Red
22
+ @status_txt.text = "Status: Running"
23
+
24
+ when CRAWL_PAUSED
25
+ self.backColor = FXColor::Yellow
26
+ @status_txt.text = "Status: Paused"
27
+ end
28
+ end
29
+
30
+ if status.has_key? :link_size
31
+ @link_size_txt.text = "Links: #{status[:link_size]}"
32
+ end
33
+
34
+ if status.has_key? :page_size
35
+ @page_size_txt.text = "Pages: #{status[:page_size]}"
36
+ end
37
+
38
+ if status.has_key? :total_requests
39
+ @requests_txt.text = "Requests: #{status[:total_requests]}"
40
+ end
41
+ end
42
+
43
+ def initialize(owner)
44
+ super(owner, :opts => LAYOUT_FILL_X|FRAME_RAISED)
45
+ @info_fields = []
46
+ #frame = FXHorizontalFrame.new(, :opts => LAYOUT_FILL_Y, :padding => 0)
47
+ frame = self
48
+ @info_fields << ( @status_txt = FXLabel.new(frame, "Status: Stopped", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
49
+ @info_fields << (@link_size_txt = FXLabel.new(frame, "Links: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
50
+ @info_fields << (@page_size_txt = FXLabel.new(frame, "Pages: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
51
+ @info_fields << (@requests_txt = FXLabel.new(frame, "Requests: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
52
+
53
+ @info_fields.each do |i|
54
+ i.justify = JUSTIFY_LEFT
55
+ end
56
+ end
57
+
58
+ end
59
+ end
60
+ end
61
+ end
71
62
  end
@@ -1,29 +1,20 @@
1
- #.
2
- # bags.rb
3
- #.
4
- # Copyright 2014 by siberas, http://www.siberas.de
5
- # This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
6
- # WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
7
- # WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
8
- # You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
9
-
10
1
  # @private
11
- module Watobo#:nodoc: all
12
- module Crawler
13
- class PageBag
14
- attr :page, :depth
15
- def initialize(page, depth)
16
- @page = page
17
- @depth = depth
18
- end
19
- end
20
-
21
- class LinkBag
22
- attr :link, :depth
23
- def initialize(link, depth)
24
- @link = link
25
- @depth = depth
26
- end
27
- end
28
- end
2
+ module Watobo#:nodoc: all
3
+ module Crawler
4
+ class PageBag
5
+ attr :page, :depth
6
+ def initialize(page, depth)
7
+ @page = page
8
+ @depth = depth
9
+ end
10
+ end
11
+
12
+ class LinkBag
13
+ attr :link, :depth
14
+ def initialize(link, depth)
15
+ @link = link
16
+ @depth = depth
17
+ end
18
+ end
19
+ end
29
20
  end
@@ -1,22 +1,13 @@
1
- #.
2
- # constants.rb
3
- #.
4
- # Copyright 2014 by siberas, http://www.siberas.de
5
- # This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
6
- # WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
7
- # WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
8
- # You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
9
-
10
1
  # @private
11
- module Watobo#:nodoc: all
12
- module Plugin
13
- module Crawler
14
- module Constants
15
- CRAWL_NONE = 0x00
16
- CRAWL_RUNNING = 0x01
17
- CRAWL_PAUSED = 0x02
18
-
19
- end
20
- end
21
- end
2
+ module Watobo#:nodoc: all
3
+ module Plugin
4
+ module Crawler
5
+ module Constants
6
+ CRAWL_NONE = 0x00
7
+ CRAWL_RUNNING = 0x01
8
+ CRAWL_PAUSED = 0x02
9
+
10
+ end
11
+ end
12
+ end
22
13
  end
@@ -1,517 +1,508 @@
1
- #.
2
- # engine.rb
3
- #.
4
- # Copyright 2014 by siberas, http://www.siberas.de
5
- # This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
6
- # WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
7
- # WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
8
- # You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
9
-
10
1
  # @private
11
- module Watobo#:nodoc: all
12
- module Crawler
13
-
14
- class Agent < Mechanize
15
-
16
- def initialize(opts)
17
- super()
18
-
19
-
20
- self.verify_mode = OpenSSL::SSL::VERIFY_NONE
21
- self.ignore_bad_chunking = true
22
- self.keep_alive = false
23
-
24
- self.user_agent = opts[:user_agent] if opts.has_key?(:user_agent)
25
-
26
- if opts.has_key? :username and opts.has_key? :password
27
- unless opts[:username].empty? and opts[:password].empty?
28
-
29
- user = opts[:username]
30
- pw = opts[:password]
31
- uri = opts[:auth_uri]
32
- # puts "Got Credentials for #{uri}: #{user} / #{pw}"
33
- self.add_auth(uri, user , pw )
34
- # TODO: remove this workaround for a Mechanize Bug (#243)
35
- p = self.get uri
36
- end
37
- end
38
-
39
- if ( opts.has_key? :proxy_host ) && ( opts.has_key? :proxy_port )
40
- self.set_proxy( opts[:proxy_host], opts[:proxy_port] )
41
- end
42
-
43
- if opts.has_key? :pre_connect_hook
44
- self.pre_connect_hooks << opts[:pre_connect_hook] if opts[:pre_connect_hook].respond_to? :call
45
- end
46
-
47
- unless opts[:cookie_jar].nil?
48
- clean_jar = Mechanize::CookieJar.new
49
- opts[:cookie_jar].each{ |cookie|
50
- clean_jar.add! cookie
51
- }
52
- self.cookie_jar = clean_jar
53
- end
54
-
55
- end
56
-
57
- end
58
-
59
- class Engine
60
- include Watobo::Plugin::Crawler::Constants
61
-
62
- def subscribe(event, &callback)
63
- (@event_dispatcher_listeners[event] ||= []) << callback
64
- end
65
-
66
- def clearEvents(event)
67
- @event_dispatcher_listeners[event] ||= []
68
- @event_dispatcher_listeners[event].clear
69
- end
70
-
71
- def notify(event, *args)
72
- if @event_dispatcher_listeners[event]
73
- # puts "NOTIFY: #{self}(:#{event}) [#{@event_dispatcher_listeners[event].length}]" if $DEBUG
74
- @event_dispatcher_listeners[event].each do |m|
75
- m.call(*args) if m.respond_to? :call
76
- end
77
- end
78
- end
79
-
80
- def settings
81
- @opts
82
- end
83
-
84
-
85
-
86
- def get_page(url, opts={})
87
- ro = {}.update @opts
88
- ro.update opts
89
- agent = Crawler::Agent.new(ro)
90
- page = nil
91
- page = agent.get url
92
- return agent, page
93
- end
94
-
95
- def initialize(opts={})
96
- @event_dispatcher_listeners = Hash.new
97
- @status_lock = Mutex.new
98
-
99
- @opts = {
100
- :submit_forms => true,
101
- :max_depth => 5,
102
- :max_repeat => 20,
103
- :max_threads => 4,
104
- :user_agent => "watobo-crawler",
105
- :proxy_host => '127.0.0.1',
106
- :proxy_port => Watobo::Conf::Interceptor.port,
107
- :delay => 0,
108
- :head_request_pattern => '(pdf|swf|doc|flv|jpg|png|gif)',
109
- :allowed_hosts => [], # regex's
110
- :allowed_urls => [], # regex's
111
- :excluded_urls => ["logout"], # regex's
112
- :excluded_fields => ["userid","username","password"], # regex's'
113
- :excluded_form_names => [], # regex's'
114
- :root_path => "", # regex
115
- :username => "",
116
- :password => "",
117
- :auth_uri => nil,
118
- :auth_domain => "", # for ntlm auth
119
- :cookie_jar => nil
120
- }
121
-
122
- @opts.update opts
123
- @opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?
124
-
125
- @stats = {
126
- :total_requests => 0
127
- }
128
-
129
- @link_keys = Hash.new
130
- @link_counts = Hash.new
131
-
132
- @form_keys = Hash.new
133
- @form_counts = Hash.new
134
-
135
- end
136
-
137
- def pause
138
- false
139
- end
140
-
141
- def cancel
142
- puts "[CRAWLER] - CANCEL!!"
143
- #@status_lock.synchronize do
144
- # @engine_status = CRAWL_NONE
2
+ module Watobo#:nodoc: all
3
+ module Crawler
4
+
5
+ class Agent < Mechanize
6
+
7
+ def initialize(opts)
8
+ super()
9
+
10
+
11
+ self.verify_mode = OpenSSL::SSL::VERIFY_NONE
12
+ self.ignore_bad_chunking = true
13
+ self.keep_alive = false
14
+
15
+ self.user_agent = opts[:user_agent] if opts.has_key?(:user_agent)
16
+
17
+ if opts.has_key? :username and opts.has_key? :password
18
+ unless opts[:username].empty? and opts[:password].empty?
19
+
20
+ user = opts[:username]
21
+ pw = opts[:password]
22
+ uri = opts[:auth_uri]
23
+ # puts "Got Credentials for #{uri}: #{user} / #{pw}"
24
+ self.add_auth(uri, user , pw )
25
+ # TODO: remove this workaround for a Mechanize Bug (#243)
26
+ p = self.get uri
27
+ end
28
+ end
29
+
30
+ if ( opts.has_key? :proxy_host ) && ( opts.has_key? :proxy_port )
31
+ self.set_proxy( opts[:proxy_host], opts[:proxy_port] )
32
+ end
33
+
34
+ if opts.has_key? :pre_connect_hook
35
+ self.pre_connect_hooks << opts[:pre_connect_hook] if opts[:pre_connect_hook].respond_to? :call
36
+ end
37
+
38
+ unless opts[:cookie_jar].nil?
39
+ clean_jar = Mechanize::CookieJar.new
40
+ opts[:cookie_jar].each{ |cookie|
41
+ clean_jar.add! cookie
42
+ }
43
+ self.cookie_jar = clean_jar
44
+ end
45
+
46
+ end
47
+
48
+ end
49
+
50
+ class Engine
51
+ include Watobo::Plugin::Crawler::Constants
52
+
53
+ def subscribe(event, &callback)
54
+ (@event_dispatcher_listeners[event] ||= []) << callback
55
+ end
56
+
57
+ def clearEvents(event)
58
+ @event_dispatcher_listeners[event] ||= []
59
+ @event_dispatcher_listeners[event].clear
60
+ end
61
+
62
+ def notify(event, *args)
63
+ if @event_dispatcher_listeners[event]
64
+ # puts "NOTIFY: #{self}(:#{event}) [#{@event_dispatcher_listeners[event].length}]" if $DEBUG
65
+ @event_dispatcher_listeners[event].each do |m|
66
+ m.call(*args) if m.respond_to? :call
67
+ end
68
+ end
69
+ end
70
+
71
+ def settings
72
+ @opts
73
+ end
74
+
75
+
76
+
77
+ def get_page(url, opts={})
78
+ ro = {}.update @opts
79
+ ro.update opts
80
+ agent = Crawler::Agent.new(ro)
81
+ page = nil
82
+ page = agent.get url
83
+ return agent, page
84
+ end
85
+
86
+ def initialize(opts={})
87
+ @event_dispatcher_listeners = Hash.new
88
+ @status_lock = Mutex.new
89
+
90
+ @opts = {
91
+ :submit_forms => true,
92
+ :max_depth => 5,
93
+ :max_repeat => 20,
94
+ :max_threads => 4,
95
+ :user_agent => "watobo-crawler",
96
+ :proxy_host => '127.0.0.1',
97
+ :proxy_port => Watobo::Conf::Interceptor.port,
98
+ :delay => 0,
99
+ :head_request_pattern => '(pdf|swf|doc|flv|jpg|png|gif)',
100
+ :allowed_hosts => [], # regex's
101
+ :allowed_urls => [], # regex's
102
+ :excluded_urls => ["logout"], # regex's
103
+ :excluded_fields => ["userid","username","password"], # regex's'
104
+ :excluded_form_names => [], # regex's'
105
+ :root_path => "", # regex
106
+ :username => "",
107
+ :password => "",
108
+ :auth_uri => nil,
109
+ :auth_domain => "", # for ntlm auth
110
+ :cookie_jar => nil
111
+ }
112
+
113
+ @opts.update opts
114
+ @opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?
115
+
116
+ @stats = {
117
+ :total_requests => 0
118
+ }
119
+
120
+ @link_keys = Hash.new
121
+ @link_counts = Hash.new
122
+
123
+ @form_keys = Hash.new
124
+ @form_counts = Hash.new
125
+
126
+ end
127
+
128
+ def pause
129
+ false
130
+ end
131
+
132
+ def cancel
133
+ puts "[CRAWLER] - CANCEL!!"
134
+ #@status_lock.synchronize do
135
+ # @engine_status = CRAWL_NONE
145
136
  #end
146
- Watobo::Crawler::Status.engine = CRAWL_NONE
147
- @grabber_threads.each do |gt|
148
- puts "Killing Thread #{gt}"
149
- gt.kill
150
- gt.raise "CANCEL"
151
- end
152
- @grabber_threads.each{|t| t.join }
153
-
154
- @link_queue.clear
155
- @page_queue.clear
156
- @grabber_threads.clear
157
- @link_keys.clear
158
- @link_counts.clear
159
-
160
- @form_keys.clear
161
- @form_counts.clear
162
-
163
- #notify( :update_status, current_status )
164
- puts "CANCELED - CANCELED"
165
- # exit
166
- end
167
-
137
+ Watobo::Crawler::Status.engine = CRAWL_NONE
138
+ @grabber_threads.each do |gt|
139
+ puts "Killing Thread #{gt}"
140
+ gt.kill
141
+ gt.raise "CANCEL"
142
+ end
143
+ @grabber_threads.each{|t| t.join }
144
+
145
+ @link_queue.clear
146
+ @page_queue.clear
147
+ @grabber_threads.clear
148
+ @link_keys.clear
149
+ @link_counts.clear
150
+
151
+ @form_keys.clear
152
+ @form_counts.clear
153
+
154
+ #notify( :update_status, current_status )
155
+ puts "CANCELED - CANCELED"
156
+ # exit
157
+ end
158
+
168
159
  def run(url, opts={})
169
160
  #engine_status = CRAWL_RUNNING
170
161
  Watobo::Crawler::Status.reset
171
162
  Watobo::Crawler::Status.engine = CRAWL_RUNNING
172
-
173
- @opts.update opts
163
+
164
+ @opts.update opts
174
165
  @opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?
175
166
 
176
167
  puts "crawler settings:"
177
168
  puts @opts.to_json
178
-
179
-
180
- @link_queue = Queue.new
181
- @page_queue = Queue.new
169
+
170
+
171
+ @link_queue = Queue.new
172
+ @page_queue = Queue.new
173
+
174
+ @link_keys = Hash.new
175
+ @link_counts = Hash.new
182
176
 
183
- @link_keys = Hash.new
184
- @link_counts = Hash.new
185
-
186
- @form_keys = Hash.new
187
- @form_counts = Hash.new
188
-
189
- @skipped_sites = Hash.new
190
-
191
- @grabber_threads = []
192
- start_link = URI.parse url
193
- return false if start_link.host.nil?
194
-
195
- allow_host(start_link)
196
-
197
- @link_queue.enq LinkBag.new(start_link, 0)
198
-
199
-
200
- notify(:log, "Crawling #{url} started ..." )
201
-
202
- @opts[:max_threads].times do |i|
203
- g = Grabber.new(@link_queue, @page_queue, @opts )
204
- @grabber_threads << g.run
205
- end
206
-
207
- puts "* startet #{@grabber_threads.length} grabbers"
208
-
209
- loop do
210
- pagebag = @page_queue.deq
211
-
212
- process_links(pagebag)
213
-
214
- process_forms(pagebag)
177
+ @form_keys = Hash.new
178
+ @form_counts = Hash.new
179
+
180
+ @skipped_sites = Hash.new
181
+
182
+ @grabber_threads = []
183
+ start_link = URI.parse url
184
+ return false if start_link.host.nil?
185
+
186
+ allow_host(start_link)
187
+
188
+ @link_queue.enq LinkBag.new(start_link, 0)
189
+
190
+
191
+ notify(:log, "Crawling #{url} started ..." )
192
+
193
+ @opts[:max_threads].times do |i|
194
+ g = Grabber.new(@link_queue, @page_queue, @opts )
195
+ @grabber_threads << g.run
196
+ end
197
+
198
+ puts "* startet #{@grabber_threads.length} grabbers"
199
+
200
+ loop do
201
+ pagebag = @page_queue.deq
202
+
203
+ process_links(pagebag)
204
+
205
+ process_forms(pagebag)
215
206
  #@stats[:total_requests] += 1 unless pagebag.nil?
216
207
  Watobo::Crawler::Status.inc_requests() unless pagebag.nil?
217
208
  Watobo::Crawler::Status.page_size= @page_queue.size
218
- Watobo::Crawler::Status.link_size= @link_queue.size
219
-
220
- puts "Links/Pages: #{@link_queue.size}/#{@page_queue.size}"
221
- #notify( :update_status, current_status )
222
- # if @link_queue.empty? and @page_queue.empty?
223
- if @page_queue.empty?
224
- # if page_queue is empty wait for all grabber threads finishing the link_queue
225
- until @link_queue.num_waiting == @grabber_threads.length
226
- Thread.pass
227
- end
228
- # when the link_queue is finished check the page_queue. Crawling is finished if page_queue is empty too.
229
- if @page_queue.empty?
230
- @grabber_threads.each { |t| t.kill }
231
- puts "Finished Crawling"
209
+ Watobo::Crawler::Status.link_size= @link_queue.size
210
+
211
+ puts "Links/Pages: #{@link_queue.size}/#{@page_queue.size}"
212
+ #notify( :update_status, current_status )
213
+ # if @link_queue.empty? and @page_queue.empty?
214
+ if @page_queue.empty?
215
+ # if page_queue is empty wait for all grabber threads finishing the link_queue
216
+ until @link_queue.num_waiting == @grabber_threads.length
217
+ Thread.pass
218
+ end
219
+ # when the link_queue is finished check the page_queue. Crawling is finished if page_queue is empty too.
220
+ if @page_queue.empty?
221
+ @grabber_threads.each { |t| t.kill }
222
+ puts "Finished Crawling"
232
223
  #@status_lock.synchronize{ @engine_status = CRAWL_NONE }
233
224
  Watobo::Crawler::Status.engine = CRAWL_NONE
234
-
235
- notify(:log, "Crawling finished")
236
- #notify( :update_status, current_status )
237
- break
238
-
239
- end
240
- end
241
-
242
- end
243
-
244
- end
245
-
246
- private
247
-
248
- def current_status
225
+
226
+ notify(:log, "Crawling finished")
227
+ #notify( :update_status, current_status )
228
+ break
229
+
230
+ end
231
+ end
232
+
233
+ end
234
+
235
+ end
236
+
237
+ private
238
+
239
+ def current_status
249
240
  {
250
- :engine_status => @engine_status,
251
- :link_size => @link_queue.size,
252
- :page_size => @page_queue.size
253
- }.update @stats
254
-
255
- end
256
-
257
-
258
- def allow_host(uri)
259
- if uri.is_a? URI
260
- site = uri.site.to_s
261
- # puts "Valid Site: #{site}"
262
- ah = allowed_hosts
263
- ah << site
264
- end
265
- end
266
-
267
- def process_forms(pagebag)
268
- return false unless pagebag.respond_to? :page
269
- page=pagebag.page
270
- return false unless page.respond_to? :forms
271
- page.forms.each do |f|
272
-
273
- action = page.uri.merge f.action unless f.action =~ /^http/
274
- f.action = action.to_s
275
-
276
- if send_form? f
277
- # puts "SUBMIT FORM: #{f.action}"
278
- send_form(f, pagebag.depth)
279
- end
280
- end
281
- end
282
-
283
- def process_links(pagebag)
284
- return false unless pagebag.respond_to? :page
285
- page = pagebag.page
286
- return false unless page.respond_to? :links
287
-
288
- page.links.each do |l|
289
- begin
241
+ :engine_status => @engine_status,
242
+ :link_size => @link_queue.size,
243
+ :page_size => @page_queue.size
244
+ }.update @stats
245
+
246
+ end
247
+
248
+
249
+ def allow_host(uri)
250
+ if uri.is_a? URI
251
+ site = uri.site.to_s
252
+ # puts "Valid Site: #{site}"
253
+ ah = allowed_hosts
254
+ ah << site
255
+ end
256
+ end
257
+
258
+ def process_forms(pagebag)
259
+ return false unless pagebag.respond_to? :page
260
+ page=pagebag.page
261
+ return false unless page.respond_to? :forms
262
+ page.forms.each do |f|
263
+
264
+ action = page.uri.merge f.action unless f.action =~ /^http/
265
+ f.action = action.to_s
266
+
267
+ if send_form? f
268
+ # puts "SUBMIT FORM: #{f.action}"
269
+ send_form(f, pagebag.depth)
270
+ end
271
+ end
272
+ end
273
+
274
+ def process_links(pagebag)
275
+ return false unless pagebag.respond_to? :page
276
+ page = pagebag.page
277
+ return false unless page.respond_to? :links
278
+
279
+ page.links.each do |l|
280
+ begin
290
281
  link = l
291
282
  next if l.href.nil?
292
-
293
- link = page.uri.merge l.uri unless l.href =~ /^http/
294
- # puts "FOLLOW LINK #{link} ?"
295
- if follow_link? link
296
- # puts ">> OK"
297
- submit_link(link, pagebag.depth)
298
- else
299
- # puts ">> NO"
300
- end
301
- rescue => bang
283
+
284
+ link = page.uri.merge l.uri unless l.href =~ /^http/
285
+ # puts "FOLLOW LINK #{link} ?"
286
+ if follow_link? link
287
+ # puts ">> OK"
288
+ submit_link(link, pagebag.depth)
289
+ else
290
+ # puts ">> NO"
291
+ end
292
+ rescue => bang
302
293
  puts bang
303
- puts bang.backtrace if $DEBUG
304
- end
305
- end
306
-
307
- end
308
-
309
-
310
- def submit_link(link, depth)
311
- # @link_keys[link_key(link)] = link
312
-
313
- clk = link_key(link, :clear_values => true)
314
- @link_counts[clk] ||= 0
315
- @link_counts[clk] += 1
316
- lk = link_key(link)
317
- return false if @link_keys.has_key? lk
318
- @link_keys[lk] = nil
319
- if @link_counts[clk] < @opts[:max_repeat]
320
- @link_queue.enq LinkBag.new(link, depth)
321
- else
322
- puts "! MAX REPEAT !\nSkipped link #{link}" if $DEBUG
323
- end
324
- end
325
-
326
- def form_key(form, opts={} )
327
- o = { :clear_values => false }
328
- o.update opts
329
-
330
- fp = "#{form.action}"
331
- fp << form.method
332
- if form.request_data =~ /=/
333
- data = form.request_data.split("&").sort.join("&")
334
- if o[:clear_values]
335
- fp << data.gsub(/=[^&]*/,'=')
336
- else
337
- fp << data
338
- end
339
- end
340
- fkey = Digest::MD5.hexdigest fp
341
- fkey
342
- end
343
-
344
- def send_form(form, depth)
345
- return false if @engine_status == CRAWL_NONE
346
- cfk = form_key(form, :clear_values => true)
347
- @form_counts[cfk] ||= 0
348
- @form_counts[cfk] += 1
349
-
350
- # @form_keys[form_key(form)] = form
351
- fk = form_key(form)
352
- return false if @form_keys.has_key? fk
353
- @form_keys[fk] = nil
354
- begin
355
- if @form_counts[cfk] < @opts[:max_repeat]
356
- if form.buttons.length > 0
357
- p = form.click_button
358
- else
359
- p = form.submit()
360
- end
361
- puts p.class
362
- @page_queue.enq PageBag.new(p, depth+1)
363
- else
364
- puts "! MAX REPEAT !\nSkipped Form #{form.action}"
365
- end
366
- rescue => bang
367
- puts bang
368
- puts bang.backtrace
369
- end
370
- end
371
-
372
- def send_form?(form)
373
- # puts "SEND FORM?"
374
- return false unless engine_running?
375
- return false unless @opts[:submit_forms] == true
376
- # puts "> submit_forms"
377
- return false unless allowed? form.action
378
- #puts "> allowed"
379
- return false unless fields_allowed? form
380
- #puts "> fields allowed"
381
- return false if form_sent? form
382
- # puts "> form not sent"
383
- return true
384
- end
385
-
386
- def follow_link?(link)
387
- return false unless allowed? link
388
- return false if link_is_followed? link
389
- return true
390
- end
391
-
392
- def host_allowed?(uri)
393
- #puts "ALLOWED HOSTS =>"
394
- #puts allowed_hosts
395
- #puts "---"
396
- # puts "Host Allowed?"
397
- ah = allowed_hosts
398
- # puts ah.class
399
- #puts ah
400
- return false if ah.empty?
401
- ahc = ah.select{ |h| uri.site =~ /^#{h}$/ }.length
402
- if ahc > 0
403
- # puts "> Host IS allowed!"
404
- return true
405
- end
406
- # puts "> Host is NOT allowed!"
407
- return false
408
- end
409
-
410
- def url_allowed?(uri)
411
- # puts "* excluded_urls"
412
- # puts exluded_urls
413
- return false if excluded_urls.select{ |url| uri.path =~ /#{url}/ }.length > 0
414
- # puts "* allowed_urls"
415
- # puts allowed_urls
416
- return true if allowed_urls.empty?
417
- return true if allowed_urls.select{ |url| uri.path =~ /#{url}/ }.length > 0
418
- # puts "> URL is NOT allowed"
419
- return false
420
- end
421
-
422
- def path_allowed?(uri)
423
- return true if root_path.nil?
424
- return true if root_path.empty?
425
- return true if uri.path =~ /^#{root_path}/
426
- # puts "> PATH is NOT ALLOWED"
427
- return false
428
- end
429
-
430
- def cleanup_uri(obj)
431
- uri = nil
432
- uri = obj.uri if obj.respond_to? :uri
433
- uri = URI.parse(obj) if obj.is_a? String
434
- uri = obj if obj.is_a? URI::HTTP
435
- uri
436
- end
437
-
438
- def allowed?(link)
439
- valid = false
440
- # need to handle different link objects, Mechanize::Page::Link and URIs
441
- uri = nil
442
- uri = link.uri if link.respond_to? :uri
443
- uri = URI.parse(link) if link.is_a? String
444
- uri = link if link.is_a? URI::HTTP
445
-
446
- return false if uri.nil?
447
-
448
- host_allowed?(uri) &&
449
- url_allowed?(uri) &&
450
- path_allowed?(uri)
451
- end
452
-
453
- def form_sent?(form)
454
-
455
- @form_keys.has_key? form_key(form)
456
- end
457
-
458
- def link_key(link, opts={})
459
- o = { :clear_values => false }
460
- o.update opts
461
-
462
- uri = cleanup_uri(link)
463
-
464
- query_sorted = ""
465
- query_sorted = uri.query.split("&").sort.join("&") unless uri.query.nil?
466
-
467
- key = ""
468
- key << uri.scheme
469
- key << uri.site
470
- key << uri.path
471
- key << query_sorted
472
- key.gsub!(/=[^&]*/,'=') if o[:clear_values] == true
473
-
474
- Digest::MD5.hexdigest key
475
- end
476
-
477
- def engine_running?
478
- @status_lock.synchronize do
479
- return false if @engine_status == CRAWL_NONE
480
- return true
481
- end
482
- end
483
-
484
- def link_is_followed?(link)
485
-
486
- return true if @link_keys.has_key? link_key(link)
487
-
488
- false
489
- end
490
-
491
- def fields_allowed?(form)
492
- form.fields.each do |f|
493
- excluded_fields.each do |ef|
494
- return false if f.name =~ /#{ef}/
495
- end
496
- end
497
- return true
498
- end
499
-
500
- def method_missing(name, *args, &block)
501
- # puts "* instance method missing (#{name})"
502
- if name =~ /(.*)=$/
503
- @opts.has_key? $1.to_sym || super
504
- @opts[$1.to_sym] = args[0]
505
- return @opts[$1.to_sym]
506
- else
507
- k = name.to_sym
508
- @opts.has_key? k || super
509
- # puts "Value Found For #{k.to_yaml}"
510
- return @opts[k]
511
-
512
- end
513
- end
514
- end
515
- end
516
-
517
- end
294
+ puts bang.backtrace if $DEBUG
295
+ end
296
+ end
297
+
298
+ end
299
+
300
+
301
+ def submit_link(link, depth)
302
+ # @link_keys[link_key(link)] = link
303
+
304
+ clk = link_key(link, :clear_values => true)
305
+ @link_counts[clk] ||= 0
306
+ @link_counts[clk] += 1
307
+ lk = link_key(link)
308
+ return false if @link_keys.has_key? lk
309
+ @link_keys[lk] = nil
310
+ if @link_counts[clk] < @opts[:max_repeat]
311
+ @link_queue.enq LinkBag.new(link, depth)
312
+ else
313
+ puts "! MAX REPEAT !\nSkipped link #{link}" if $DEBUG
314
+ end
315
+ end
316
+
317
+ def form_key(form, opts={} )
318
+ o = { :clear_values => false }
319
+ o.update opts
320
+
321
+ fp = "#{form.action}"
322
+ fp << form.method
323
+ if form.request_data =~ /=/
324
+ data = form.request_data.split("&").sort.join("&")
325
+ if o[:clear_values]
326
+ fp << data.gsub(/=[^&]*/,'=')
327
+ else
328
+ fp << data
329
+ end
330
+ end
331
+ fkey = Digest::MD5.hexdigest fp
332
+ fkey
333
+ end
334
+
335
+ def send_form(form, depth)
336
+ return false if @engine_status == CRAWL_NONE
337
+ cfk = form_key(form, :clear_values => true)
338
+ @form_counts[cfk] ||= 0
339
+ @form_counts[cfk] += 1
340
+
341
+ # @form_keys[form_key(form)] = form
342
+ fk = form_key(form)
343
+ return false if @form_keys.has_key? fk
344
+ @form_keys[fk] = nil
345
+ begin
346
+ if @form_counts[cfk] < @opts[:max_repeat]
347
+ if form.buttons.length > 0
348
+ p = form.click_button
349
+ else
350
+ p = form.submit()
351
+ end
352
+ puts p.class
353
+ @page_queue.enq PageBag.new(p, depth+1)
354
+ else
355
+ puts "! MAX REPEAT !\nSkipped Form #{form.action}"
356
+ end
357
+ rescue => bang
358
+ puts bang
359
+ puts bang.backtrace
360
+ end
361
+ end
362
+
363
+ def send_form?(form)
364
+ # puts "SEND FORM?"
365
+ return false unless engine_running?
366
+ return false unless @opts[:submit_forms] == true
367
+ # puts "> submit_forms"
368
+ return false unless allowed? form.action
369
+ #puts "> allowed"
370
+ return false unless fields_allowed? form
371
+ #puts "> fields allowed"
372
+ return false if form_sent? form
373
+ # puts "> form not sent"
374
+ return true
375
+ end
376
+
377
+ def follow_link?(link)
378
+ return false unless allowed? link
379
+ return false if link_is_followed? link
380
+ return true
381
+ end
382
+
383
+ def host_allowed?(uri)
384
+ #puts "ALLOWED HOSTS =>"
385
+ #puts allowed_hosts
386
+ #puts "---"
387
+ # puts "Host Allowed?"
388
+ ah = allowed_hosts
389
+ # puts ah.class
390
+ #puts ah
391
+ return false if ah.empty?
392
+ ahc = ah.select{ |h| uri.site =~ /^#{h}$/ }.length
393
+ if ahc > 0
394
+ # puts "> Host IS allowed!"
395
+ return true
396
+ end
397
+ # puts "> Host is NOT allowed!"
398
+ return false
399
+ end
400
+
401
+ def url_allowed?(uri)
402
+ # puts "* excluded_urls"
403
+ # puts exluded_urls
404
+ return false if excluded_urls.select{ |url| uri.path =~ /#{url}/ }.length > 0
405
+ # puts "* allowed_urls"
406
+ # puts allowed_urls
407
+ return true if allowed_urls.empty?
408
+ return true if allowed_urls.select{ |url| uri.path =~ /#{url}/ }.length > 0
409
+ # puts "> URL is NOT allowed"
410
+ return false
411
+ end
412
+
413
+ def path_allowed?(uri)
414
+ return true if root_path.nil?
415
+ return true if root_path.empty?
416
+ return true if uri.path =~ /^#{root_path}/
417
+ # puts "> PATH is NOT ALLOWED"
418
+ return false
419
+ end
420
+
421
+ def cleanup_uri(obj)
422
+ uri = nil
423
+ uri = obj.uri if obj.respond_to? :uri
424
+ uri = URI.parse(obj) if obj.is_a? String
425
+ uri = obj if obj.is_a? URI::HTTP
426
+ uri
427
+ end
428
+
429
+ def allowed?(link)
430
+ valid = false
431
+ # need to handle different link objects, Mechanize::Page::Link and URIs
432
+ uri = nil
433
+ uri = link.uri if link.respond_to? :uri
434
+ uri = URI.parse(link) if link.is_a? String
435
+ uri = link if link.is_a? URI::HTTP
436
+
437
+ return false if uri.nil?
438
+
439
+ host_allowed?(uri) &&
440
+ url_allowed?(uri) &&
441
+ path_allowed?(uri)
442
+ end
443
+
444
+ def form_sent?(form)
445
+
446
+ @form_keys.has_key? form_key(form)
447
+ end
448
+
449
+ def link_key(link, opts={})
450
+ o = { :clear_values => false }
451
+ o.update opts
452
+
453
+ uri = cleanup_uri(link)
454
+
455
+ query_sorted = ""
456
+ query_sorted = uri.query.split("&").sort.join("&") unless uri.query.nil?
457
+
458
+ key = ""
459
+ key << uri.scheme
460
+ key << uri.site
461
+ key << uri.path
462
+ key << query_sorted
463
+ key.gsub!(/=[^&]*/,'=') if o[:clear_values] == true
464
+
465
+ Digest::MD5.hexdigest key
466
+ end
467
+
468
+ def engine_running?
469
+ @status_lock.synchronize do
470
+ return false if @engine_status == CRAWL_NONE
471
+ return true
472
+ end
473
+ end
474
+
475
+ def link_is_followed?(link)
476
+
477
+ return true if @link_keys.has_key? link_key(link)
478
+
479
+ false
480
+ end
481
+
482
+ def fields_allowed?(form)
483
+ form.fields.each do |f|
484
+ excluded_fields.each do |ef|
485
+ return false if f.name =~ /#{ef}/
486
+ end
487
+ end
488
+ return true
489
+ end
490
+
491
+ def method_missing(name, *args, &block)
492
+ # puts "* instance method missing (#{name})"
493
+ if name =~ /(.*)=$/
494
+ @opts.has_key? $1.to_sym || super
495
+ @opts[$1.to_sym] = args[0]
496
+ return @opts[$1.to_sym]
497
+ else
498
+ k = name.to_sym
499
+ @opts.has_key? k || super
500
+ # puts "Value Found For #{k.to_yaml}"
501
+ return @opts[k]
502
+
503
+ end
504
+ end
505
+ end
506
+ end
507
+
508
+ end