watobo 0.9.19 → 0.9.20

Sign up to get free protection for your applications and to get access to all the features.
Files changed (266) hide show
  1. data/CHANGELOG.md +104 -0
  2. data/bin/nfq_server.rb +8 -20
  3. data/bin/watobo_gui.rb +8 -20
  4. data/config/forwarding_proxy.yml +2 -2
  5. data/lib/watobo.rb +12 -22
  6. data/lib/watobo/adapters.rb +12 -24
  7. data/lib/watobo/adapters/data_store.rb +76 -66
  8. data/lib/watobo/adapters/file/file_store.rb +295 -307
  9. data/lib/watobo/adapters/session_store.rb +13 -25
  10. data/lib/watobo/ca.rb +9 -21
  11. data/lib/watobo/config.rb +205 -217
  12. data/lib/watobo/constants.rb +8 -20
  13. data/lib/watobo/core.rb +11 -23
  14. data/lib/watobo/core/active_check.rb +11 -21
  15. data/lib/watobo/core/active_checks.rb +57 -69
  16. data/lib/watobo/core/ca.rb +388 -398
  17. data/lib/watobo/core/cert_store.rb +42 -54
  18. data/lib/watobo/core/chat.rb +100 -112
  19. data/lib/watobo/core/chats.rb +271 -275
  20. data/lib/watobo/core/client_cert_store.rb +33 -45
  21. data/lib/watobo/core/conversation.rb +56 -68
  22. data/lib/watobo/core/cookie.rb +31 -43
  23. data/lib/watobo/core/finding.rb +74 -86
  24. data/lib/watobo/core/findings.rb +113 -125
  25. data/lib/watobo/core/forwarding_proxy.rb +44 -35
  26. data/lib/watobo/core/fuzz_gen.rb +8 -20
  27. data/lib/watobo/core/intercept_carver.rb +176 -188
  28. data/lib/watobo/core/intercept_filter.rb +243 -255
  29. data/lib/watobo/core/interceptor.rb +106 -118
  30. data/lib/watobo/core/min_class.rb +12 -24
  31. data/lib/watobo/core/netfilter_queue.rb +178 -190
  32. data/lib/watobo/core/ott_cache.rb +152 -148
  33. data/lib/watobo/core/parameter.rb +53 -58
  34. data/lib/watobo/core/passive_check.rb +8 -20
  35. data/lib/watobo/core/passive_checks.rb +56 -68
  36. data/lib/watobo/core/passive_scanner.rb +54 -66
  37. data/lib/watobo/core/plugin.rb +19 -31
  38. data/lib/watobo/core/project.rb +8 -20
  39. data/lib/watobo/core/proxy.rb +51 -63
  40. data/lib/watobo/core/request.rb +128 -120
  41. data/lib/watobo/core/response.rb +59 -61
  42. data/lib/watobo/core/scanner.rb +8 -20
  43. data/lib/watobo/core/scanner3.rb +413 -425
  44. data/lib/watobo/core/scope.rb +91 -103
  45. data/lib/watobo/core/session.rb +109 -87
  46. data/lib/watobo/core/sid_cache.rb +106 -118
  47. data/lib/watobo/core/subscriber.rb +33 -45
  48. data/lib/watobo/defaults.rb +29 -41
  49. data/lib/watobo/external/diff/lcs.rb +8 -20
  50. data/lib/watobo/external/diff/lcs/array.rb +8 -20
  51. data/lib/watobo/external/diff/lcs/block.rb +8 -20
  52. data/lib/watobo/external/diff/lcs/callbacks.rb +8 -20
  53. data/lib/watobo/external/diff/lcs/change.rb +8 -20
  54. data/lib/watobo/external/diff/lcs/hunk.rb +8 -20
  55. data/lib/watobo/external/diff/lcs/ldiff.rb +8 -20
  56. data/lib/watobo/external/diff/lcs/string.rb +8 -20
  57. data/lib/watobo/externals.rb +14 -26
  58. data/lib/watobo/framework.rb +12 -24
  59. data/lib/watobo/framework/create_project.rb +68 -80
  60. data/lib/watobo/framework/init.rb +8 -20
  61. data/lib/watobo/framework/init_modules.rb +8 -20
  62. data/lib/watobo/framework/license_text.rb +36 -48
  63. data/lib/watobo/framework/load_chat.rb +21 -33
  64. data/lib/watobo/gui.rb +121 -133
  65. data/lib/watobo/gui/about_watobo.rb +8 -20
  66. data/lib/watobo/gui/browser_preview.rb +8 -20
  67. data/lib/watobo/gui/certificate_dialog.rb +8 -20
  68. data/lib/watobo/gui/chat_diff.rb +11 -21
  69. data/lib/watobo/gui/chatviewer_frame.rb +10 -22
  70. data/lib/watobo/gui/checkboxtree.rb +8 -20
  71. data/lib/watobo/gui/checks_policy_frame.rb +8 -20
  72. data/lib/watobo/gui/client_cert_dialog.rb +10 -21
  73. data/lib/watobo/gui/confirm_scan_dialog.rb +8 -20
  74. data/lib/watobo/gui/conversation_table.rb +54 -44
  75. data/lib/watobo/gui/conversation_table_ctrl.rb +215 -227
  76. data/lib/watobo/gui/conversation_table_ctrl2.rb +385 -393
  77. data/lib/watobo/gui/csrf_token_dialog.rb +11 -25
  78. data/lib/watobo/gui/custom_viewer.rb +357 -369
  79. data/lib/watobo/gui/dashboard.rb +8 -20
  80. data/lib/watobo/gui/define_scope_frame.rb +8 -20
  81. data/lib/watobo/gui/differ_frame.rb +223 -235
  82. data/lib/watobo/gui/edit_comment.rb +8 -20
  83. data/lib/watobo/gui/edit_scope_dialog.rb +8 -20
  84. data/lib/watobo/gui/export_dialog.rb +114 -0
  85. data/lib/watobo/gui/finding_info.rb +9 -21
  86. data/lib/watobo/gui/findings_tree.rb +8 -20
  87. data/lib/watobo/gui/full_scan_dialog.rb +8 -20
  88. data/lib/watobo/gui/fuzzer_gui.rb +8 -20
  89. data/lib/watobo/gui/goto_url_dialog.rb +78 -90
  90. data/lib/watobo/gui/hex_viewer.rb +25 -27
  91. data/lib/watobo/gui/html_viewer.rb +295 -307
  92. data/lib/watobo/gui/intercept_filter_dialog.rb +196 -208
  93. data/lib/watobo/gui/interceptor_gui.rb +1046 -1041
  94. data/lib/watobo/gui/interceptor_settings_dialog.rb +8 -20
  95. data/lib/watobo/gui/list_box.rb +109 -121
  96. data/lib/watobo/gui/log_file_viewer.rb +40 -52
  97. data/lib/watobo/gui/log_viewer.rb +87 -99
  98. data/lib/watobo/gui/login_wizzard.rb +8 -20
  99. data/lib/watobo/gui/main_window.rb +34 -33
  100. data/lib/watobo/gui/manual_request_editor.rb +25 -35
  101. data/lib/watobo/gui/master_pw_dialog.rb +8 -20
  102. data/lib/watobo/gui/mixins/gui_settings.rb +37 -49
  103. data/lib/watobo/gui/page_tree.rb +225 -237
  104. data/lib/watobo/gui/password_policy_dialog.rb +8 -20
  105. data/lib/watobo/gui/plugin_board.rb +8 -20
  106. data/lib/watobo/gui/preferences_dialog.rb +8 -20
  107. data/lib/watobo/gui/progress_window.rb +8 -20
  108. data/lib/watobo/gui/project_wizzard.rb +8 -20
  109. data/lib/watobo/gui/proxy_dialog.rb +117 -85
  110. data/lib/watobo/gui/quick_scan_dialog.rb +8 -20
  111. data/lib/watobo/gui/request_builder_frame.rb +125 -122
  112. data/lib/watobo/gui/request_editor.rb +53 -28
  113. data/lib/watobo/gui/rewrite_filters_dialog.rb +402 -414
  114. data/lib/watobo/gui/rewrite_rules_dialog.rb +380 -392
  115. data/lib/watobo/gui/save_chat_dialog.rb +148 -160
  116. data/lib/watobo/gui/scanner_settings_dialog.rb +8 -20
  117. data/lib/watobo/gui/select_chat_dialog.rb +8 -20
  118. data/lib/watobo/gui/session_management_dialog.rb +8 -20
  119. data/lib/watobo/gui/sites_tree.rb +118 -22
  120. data/lib/watobo/gui/status_bar.rb +8 -20
  121. data/lib/watobo/gui/table_editor.rb +76 -53
  122. data/lib/watobo/gui/tagless_viewer.rb +10 -21
  123. data/lib/watobo/gui/templates/plugin.rb +8 -20
  124. data/lib/watobo/gui/templates/plugin2.rb +99 -111
  125. data/lib/watobo/gui/templates/plugin_base.rb +152 -164
  126. data/lib/watobo/gui/text_viewer.rb +8 -20
  127. data/lib/watobo/gui/transcoder_window.rb +15 -22
  128. data/lib/watobo/gui/utils/gui_utils.rb +8 -20
  129. data/lib/watobo/gui/utils/init_icons.rb +94 -106
  130. data/lib/watobo/gui/utils/load_icons.rb +41 -53
  131. data/lib/watobo/gui/utils/load_plugins.rb +118 -130
  132. data/lib/watobo/gui/utils/master_password.rb +76 -88
  133. data/lib/watobo/gui/utils/save_default_settings.rb +121 -133
  134. data/lib/watobo/gui/utils/save_project_settings.rb +8 -20
  135. data/lib/watobo/gui/utils/save_proxy_settings.rb +53 -21
  136. data/lib/watobo/gui/utils/save_scanner_settings.rb +26 -38
  137. data/lib/watobo/gui/utils/session_history.rb +120 -132
  138. data/lib/watobo/gui/workspace_dialog.rb +8 -20
  139. data/lib/watobo/gui/www_auth_dialog.rb +8 -20
  140. data/lib/watobo/gui/xml_viewer_frame.rb +8 -20
  141. data/lib/watobo/http.rb +12 -23
  142. data/lib/watobo/http/cookies/cookies.rb +63 -70
  143. data/lib/watobo/http/data/data.rb +56 -64
  144. data/lib/watobo/http/data/json.rb +51 -0
  145. data/lib/watobo/http/url/url.rb +46 -58
  146. data/lib/watobo/http/xml/xml.rb +129 -141
  147. data/lib/watobo/interceptor.rb +11 -23
  148. data/lib/watobo/interceptor/proxy.rb +624 -625
  149. data/lib/watobo/interceptor/transparent.rb +22 -34
  150. data/lib/watobo/mixins.rb +18 -30
  151. data/lib/watobo/mixins/check_info.rb +35 -47
  152. data/lib/watobo/mixins/httpparser.rb +42 -35
  153. data/lib/watobo/mixins/request_parser.rb +8 -20
  154. data/lib/watobo/mixins/shapers.rb +484 -477
  155. data/lib/watobo/mixins/transcoders.rb +8 -20
  156. data/lib/watobo/parser.rb +9 -21
  157. data/lib/watobo/parser/html.rb +91 -103
  158. data/lib/watobo/sockets.rb +11 -23
  159. data/lib/watobo/sockets/agent.rb +836 -848
  160. data/lib/watobo/sockets/client_socket.rb +283 -277
  161. data/lib/watobo/sockets/connection.rb +409 -421
  162. data/lib/watobo/sockets/http_socket.rb +16 -23
  163. data/lib/watobo/sockets/ntlm_auth.rb +137 -149
  164. data/lib/watobo/utils.rb +18 -30
  165. data/lib/watobo/utils/check_regex.rb +8 -20
  166. data/lib/watobo/utils/copy_object.rb +8 -20
  167. data/lib/watobo/utils/crypto.rb +8 -20
  168. data/lib/watobo/utils/expand_range.rb +31 -43
  169. data/lib/watobo/utils/export_xml.rb +108 -0
  170. data/lib/watobo/utils/file_management.rb +8 -20
  171. data/lib/watobo/utils/hexprint.rb +17 -29
  172. data/lib/watobo/utils/load_chat.rb +8 -20
  173. data/lib/watobo/utils/load_icon.rb +8 -20
  174. data/lib/watobo/{external/ntlm → utils}/ntlm.rb +874 -796
  175. data/lib/watobo/utils/print_debug.rb +20 -32
  176. data/lib/watobo/utils/response_builder.rb +98 -110
  177. data/lib/watobo/utils/response_hash.rb +9 -20
  178. data/lib/watobo/utils/secure_eval.rb +10 -22
  179. data/lib/watobo/utils/strings.rb +18 -30
  180. data/lib/watobo/utils/text2request.rb +12 -20
  181. data/lib/watobo/utils/url.rb +31 -43
  182. data/lib/watobo/utils/utf16.rb +22 -0
  183. data/modules/active/Apache/mod_status.rb +9 -0
  184. data/modules/active/Apache/multiview.rb +161 -0
  185. data/modules/active/Flash/crossdomain.rb +9 -0
  186. data/modules/active/directories/dirwalker.rb +8 -20
  187. data/modules/active/discovery/fileextensions.rb +10 -22
  188. data/modules/active/discovery/http_methods.rb +8 -20
  189. data/modules/active/domino/domino_db.rb +8 -20
  190. data/modules/active/dotNET/custom_errors.rb +110 -122
  191. data/modules/active/dotNET/dotnet_files.rb +98 -110
  192. data/modules/active/fileinclusion/lfi_simple.rb +8 -20
  193. data/modules/active/jboss/jboss_basic.rb +8 -20
  194. data/modules/active/sap/business_objects.rb +63 -0
  195. data/modules/active/sap/its_commands.rb +8 -20
  196. data/modules/active/sap/its_service_parameter.rb +8 -20
  197. data/modules/active/sap/its_services.rb +8 -20
  198. data/modules/active/sap/its_xss.rb +8 -20
  199. data/modules/active/shell_shock/shell_shock.rb +149 -0
  200. data/modules/active/siebel/siebel_apps.rb +168 -180
  201. data/modules/active/sqlinjection/sql_boolean.rb +9 -21
  202. data/modules/active/sqlinjection/sqli_error.rb +10 -22
  203. data/modules/active/sqlinjection/sqli_timing.rb +228 -240
  204. data/modules/active/struts2/default_handler_ognl.rb +114 -126
  205. data/modules/active/struts2/include_params_ognl.rb +113 -125
  206. data/modules/active/xml/xml_xxe.rb +122 -127
  207. data/modules/active/xss/xss_ng.rb +223 -234
  208. data/modules/active/xss/xss_simple.rb +8 -20
  209. data/modules/passive/ajax.rb +76 -84
  210. data/modules/passive/autocomplete.rb +64 -76
  211. data/modules/passive/cookie_options.rb +8 -20
  212. data/modules/passive/cookie_xss.rb +9 -21
  213. data/modules/passive/detect_code.rb +9 -21
  214. data/modules/passive/detect_fileupload.rb +11 -22
  215. data/modules/passive/detect_infrastructure.rb +23 -35
  216. data/modules/passive/detect_one_time_tokens.rb +8 -20
  217. data/modules/passive/dirindexing.rb +9 -21
  218. data/modules/passive/disclosure_domino.rb +66 -79
  219. data/modules/passive/disclosure_emails.rb +9 -21
  220. data/modules/passive/disclosure_ipaddr.rb +15 -23
  221. data/modules/passive/filename_as_parameter.rb +8 -20
  222. data/modules/passive/form_spotter.rb +15 -21
  223. data/modules/passive/hidden_fields.rb +64 -70
  224. data/modules/passive/hotspots.rb +13 -22
  225. data/modules/passive/in_script_parameter.rb +15 -24
  226. data/modules/passive/multiple_server_headers.rb +8 -20
  227. data/modules/passive/possible_login.rb +12 -23
  228. data/modules/passive/redirect_url.rb +10 -22
  229. data/modules/passive/redirectionz.rb +9 -21
  230. data/modules/passive/sap-headers.rb +64 -76
  231. data/modules/passive/xss_dom.rb +10 -21
  232. data/plugins/catalog/catalog.rb +17 -23
  233. data/plugins/crawler/crawler.rb +12 -24
  234. data/plugins/crawler/gui.rb +13 -25
  235. data/plugins/crawler/gui/auth_frame.rb +278 -290
  236. data/plugins/crawler/gui/crawler_gui.rb +302 -320
  237. data/plugins/crawler/gui/general_settings_frame.rb +104 -116
  238. data/plugins/crawler/gui/hooks_frame.rb +88 -100
  239. data/plugins/crawler/gui/scope_frame.rb +58 -70
  240. data/plugins/crawler/gui/settings_tabbook.rb +46 -58
  241. data/plugins/crawler/gui/status_frame.rb +67 -78
  242. data/plugins/crawler/lib/bags.rb +26 -38
  243. data/plugins/crawler/lib/constants.rb +19 -31
  244. data/plugins/crawler/lib/engine.rb +505 -508
  245. data/plugins/crawler/lib/grabber.rb +77 -87
  246. data/plugins/crawler/lib/status.rb +82 -0
  247. data/plugins/crawler/lib/uri_mp.rb +20 -32
  248. data/plugins/filefinder/dbs/siebel_paths.txt +1118 -0
  249. data/plugins/filefinder/dbs/subs-big.lst +31986 -0
  250. data/plugins/filefinder/filefinder.rb +13 -23
  251. data/plugins/sqlmap/bin/test.rb +86 -98
  252. data/plugins/sqlmap/gui.rb +12 -24
  253. data/plugins/sqlmap/gui/main.rb +226 -238
  254. data/plugins/sqlmap/gui/options_frame.rb +105 -117
  255. data/plugins/sqlmap/lib/sqlmap_ctrl.rb +103 -115
  256. data/plugins/sqlmap/sqlmap.rb +10 -22
  257. data/plugins/sslchecker/cli/sslchecker_cli.rb +8 -20
  258. data/plugins/sslchecker/gui/cipher_table.rb +252 -264
  259. data/plugins/sslchecker/gui/gui.rb +267 -276
  260. data/plugins/sslchecker/gui/sslchecker.rb +12 -24
  261. data/plugins/sslchecker/lib/check.rb +172 -80
  262. data/plugins/wshell/gui/main.rb +115 -127
  263. data/plugins/wshell/lib/core.rb +85 -97
  264. data/plugins/wshell/wshell.rb +19 -31
  265. metadata +14 -6
  266. data/.yardopts +0 -24
@@ -1,61 +1,49 @@
1
- # .
1
+ #.
2
2
  # settings_tabbook.rb
3
- #
4
- # Copyright 2013 by siberas, http://www.siberas.de
5
- #
6
- # This file is part of WATOBO (Web Application Tool Box)
7
- # http://watobo.sourceforge.com
8
- #
9
- # WATOBO is free software; you can redistribute it and/or modify
10
- # it under the terms of the GNU General Public License as published by
11
- # the Free Software Foundation version 2 of the License.
12
- #
13
- # WATOBO is distributed in the hope that it will be useful,
14
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
- # GNU General Public License for more details.
17
- #
18
- # You should have received a copy of the GNU General Public License
19
- # along with WATOBO; if not, write to the Free Software
20
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21
- # .
22
- # @private
23
- module Watobo#:nodoc: all
24
- module Plugin
25
- module Crawler
26
- class Gui
27
- class SettingsTabBook < FXTabBook
28
- attr :hooks, :general, :log_viewer, :auth, :scope
29
-
30
-
31
-
32
- def initialize(owner)
33
- #@tab = FXTabBook.new(self, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
34
- super(owner, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
35
- FXTabItem.new(self, "General", nil)
36
- # frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_RAISED)
37
- @general = GeneralSettingsFrame.new(self)
38
-
39
- FXTabItem.new(self, "Scope", nil)
40
- @scope = ScopeFrame.new(self)
41
-
42
- FXTabItem.new(self, "Auth", nil)
43
- @auth = AuthFrame.new(self)
3
+ #.
4
+ # Copyright 2014 by siberas, http://www.siberas.de
5
+ # This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
6
+ # WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
7
+ # WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
8
+ # You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
44
9
 
45
-
46
- FXTabItem.new(self, "Hooks", nil)
47
- @hooks = HooksFrame.new(self)
48
-
49
- FXTabItem.new(self, "Log", nil)
50
- frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_THICK|FRAME_RAISED)
51
- @log_viewer = Watobo::Gui::LogViewer.new(frame, :append, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_SUNKEN)
52
-
53
- self.connect(SEL_COMMAND){
54
- @hooks.selected if self.current == 3
55
- }
56
- end
57
- end
58
- end
59
- end
60
- end
10
+ # @private
11
+ module Watobo#:nodoc: all
12
+ module Plugin
13
+ module Crawler
14
+ class Gui
15
+ class SettingsTabBook < FXTabBook
16
+ attr :hooks, :general, :log_viewer, :auth, :scope
17
+
18
+
19
+
20
+ def initialize(owner)
21
+ #@tab = FXTabBook.new(self, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
22
+ super(owner, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
23
+ FXTabItem.new(self, "General", nil)
24
+ # frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_RAISED)
25
+ @general = GeneralSettingsFrame.new(self)
26
+
27
+ FXTabItem.new(self, "Scope", nil)
28
+ @scope = ScopeFrame.new(self)
29
+
30
+ FXTabItem.new(self, "Auth", nil)
31
+ @auth = AuthFrame.new(self)
32
+
33
+
34
+ FXTabItem.new(self, "Hooks", nil)
35
+ @hooks = HooksFrame.new(self)
36
+
37
+ FXTabItem.new(self, "Log", nil)
38
+ frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_THICK|FRAME_RAISED)
39
+ @log_viewer = Watobo::Gui::LogViewer.new(frame, :append, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_SUNKEN)
40
+
41
+ self.connect(SEL_COMMAND){
42
+ @hooks.selected if self.current == 3
43
+ }
44
+ end
45
+ end
46
+ end
47
+ end
48
+ end
61
49
  end
@@ -1,82 +1,71 @@
1
- # .
1
+ #.
2
2
  # status_frame.rb
3
- #
4
- # Copyright 2013 by siberas, http://www.siberas.de
5
- #
6
- # This file is part of WATOBO (Web Application Tool Box)
7
- # http://watobo.sourceforge.com
8
- #
9
- # WATOBO is free software; you can redistribute it and/or modify
10
- # it under the terms of the GNU General Public License as published by
11
- # the Free Software Foundation version 2 of the License.
12
- #
13
- # WATOBO is distributed in the hope that it will be useful,
14
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
- # GNU General Public License for more details.
17
- #
18
- # You should have received a copy of the GNU General Public License
19
- # along with WATOBO; if not, write to the Free Software
20
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21
- # .
22
- # @private
23
- module Watobo#:nodoc: all
24
- module Plugin
25
- module Crawler
26
- class Gui
27
- class StatusFrame < FXHorizontalFrame
3
+ #.
4
+ # Copyright 2014 by siberas, http://www.siberas.de
5
+ # This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
6
+ # WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
7
+ # WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
8
+ # You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28
9
 
29
- include Watobo::Plugin::Crawler::Constants
30
- # :engine_status => CRAWL_NONE,
31
- # :page_size => 0,
32
- # :link_size => 0,
33
- # :skipped_domains => 0
10
+ # @private
11
+ module Watobo#:nodoc: all
12
+ module Plugin
13
+ module Crawler
14
+ class Gui
15
+ class StatusFrame < FXHorizontalFrame
16
+
17
+ include Watobo::Plugin::Crawler::Constants
18
+ # :engine_status => CRAWL_NONE,
19
+ # :page_size => 0,
20
+ # :link_size => 0,
21
+ # :skipped_domains => 0
34
22
  def update_status(status)
35
- if status.has_key? :engine_status
36
- case status[:engine_status]
37
- when CRAWL_NONE
38
- self.backColor = self.parent.backColor
39
- @status_txt.text = "Status: Idle"
40
- when CRAWL_RUNNING
41
- self.backColor = FXColor::Red
42
- @status_txt.text = "Status: Running"
43
-
44
- when CRAWL_PAUSED
45
- self.backColor = FXColor::Yellow
46
- @status_txt.text = "Status: Paused"
47
- end
48
- end
49
-
50
- if status.has_key? :link_size
51
- @link_size_txt.text = "Links: #{status[:link_size]}"
52
- end
53
-
54
- if status.has_key? :page_size
55
- @page_size_txt.text = "Pages: #{status[:page_size]}"
56
- end
57
-
58
- if status.has_key? :total_requests
59
- @requests_txt.text = "Requests: #{status[:total_requests]}"
60
- end
61
- end
62
-
63
- def initialize(owner)
64
- super(owner, :opts => LAYOUT_FILL_X|FRAME_RAISED)
65
- @info_fields = []
66
- #frame = FXHorizontalFrame.new(, :opts => LAYOUT_FILL_Y, :padding => 0)
67
- frame = self
68
- @info_fields << ( @status_txt = FXLabel.new(frame, "Status: Stopped", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
69
- @info_fields << (@link_size_txt = FXLabel.new(frame, "Links: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
70
- @info_fields << (@page_size_txt = FXLabel.new(frame, "Pages: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
71
- @info_fields << (@requests_txt = FXLabel.new(frame, "Requests: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
72
-
73
- @info_fields.each do |i|
74
- i.justify = JUSTIFY_LEFT
75
- end
76
- end
77
-
78
- end
79
- end
80
- end
81
- end
23
+ #puts status.to_yaml
24
+ if status.has_key? :engine_status
25
+ case status[:engine_status]
26
+ when CRAWL_NONE
27
+ self.backColor = self.parent.backColor
28
+ @status_txt.text = "Status: Idle"
29
+ when CRAWL_RUNNING
30
+ self.backColor = FXColor::Red
31
+ @status_txt.text = "Status: Running"
32
+
33
+ when CRAWL_PAUSED
34
+ self.backColor = FXColor::Yellow
35
+ @status_txt.text = "Status: Paused"
36
+ end
37
+ end
38
+
39
+ if status.has_key? :link_size
40
+ @link_size_txt.text = "Links: #{status[:link_size]}"
41
+ end
42
+
43
+ if status.has_key? :page_size
44
+ @page_size_txt.text = "Pages: #{status[:page_size]}"
45
+ end
46
+
47
+ if status.has_key? :total_requests
48
+ @requests_txt.text = "Requests: #{status[:total_requests]}"
49
+ end
50
+ end
51
+
52
+ def initialize(owner)
53
+ super(owner, :opts => LAYOUT_FILL_X|FRAME_RAISED)
54
+ @info_fields = []
55
+ #frame = FXHorizontalFrame.new(, :opts => LAYOUT_FILL_Y, :padding => 0)
56
+ frame = self
57
+ @info_fields << ( @status_txt = FXLabel.new(frame, "Status: Stopped", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
58
+ @info_fields << (@link_size_txt = FXLabel.new(frame, "Links: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
59
+ @info_fields << (@page_size_txt = FXLabel.new(frame, "Pages: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
60
+ @info_fields << (@requests_txt = FXLabel.new(frame, "Requests: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
61
+
62
+ @info_fields.each do |i|
63
+ i.justify = JUSTIFY_LEFT
64
+ end
65
+ end
66
+
67
+ end
68
+ end
69
+ end
70
+ end
82
71
  end
@@ -1,41 +1,29 @@
1
- # .
1
+ #.
2
2
  # bags.rb
3
- #
4
- # Copyright 2013 by siberas, http://www.siberas.de
5
- #
6
- # This file is part of WATOBO (Web Application Tool Box)
7
- # http://watobo.sourceforge.com
8
- #
9
- # WATOBO is free software; you can redistribute it and/or modify
10
- # it under the terms of the GNU General Public License as published by
11
- # the Free Software Foundation version 2 of the License.
12
- #
13
- # WATOBO is distributed in the hope that it will be useful,
14
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
- # GNU General Public License for more details.
17
- #
18
- # You should have received a copy of the GNU General Public License
19
- # along with WATOBO; if not, write to the Free Software
20
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21
- # .
22
- # @private
23
- module Watobo#:nodoc: all
24
- module Crawler
25
- class PageBag
26
- attr :page, :depth
27
- def initialize(page, depth)
28
- @page = page
29
- @depth = depth
30
- end
31
- end
3
+ #.
4
+ # Copyright 2014 by siberas, http://www.siberas.de
5
+ # This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
6
+ # WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
7
+ # WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
8
+ # You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32
9
 
33
- class LinkBag
34
- attr :link, :depth
35
- def initialize(link, depth)
36
- @link = link
37
- @depth = depth
38
- end
39
- end
40
- end
10
+ # @private
11
+ module Watobo#:nodoc: all
12
+ module Crawler
13
+ class PageBag
14
+ attr :page, :depth
15
+ def initialize(page, depth)
16
+ @page = page
17
+ @depth = depth
18
+ end
19
+ end
20
+
21
+ class LinkBag
22
+ attr :link, :depth
23
+ def initialize(link, depth)
24
+ @link = link
25
+ @depth = depth
26
+ end
27
+ end
28
+ end
41
29
  end
@@ -1,34 +1,22 @@
1
- # .
1
+ #.
2
2
  # constants.rb
3
- #
4
- # Copyright 2013 by siberas, http://www.siberas.de
5
- #
6
- # This file is part of WATOBO (Web Application Tool Box)
7
- # http://watobo.sourceforge.com
8
- #
9
- # WATOBO is free software; you can redistribute it and/or modify
10
- # it under the terms of the GNU General Public License as published by
11
- # the Free Software Foundation version 2 of the License.
12
- #
13
- # WATOBO is distributed in the hope that it will be useful,
14
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
- # GNU General Public License for more details.
17
- #
18
- # You should have received a copy of the GNU General Public License
19
- # along with WATOBO; if not, write to the Free Software
20
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21
- # .
3
+ #.
4
+ # Copyright 2014 by siberas, http://www.siberas.de
5
+ # This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
6
+ # WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
7
+ # WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
8
+ # You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
9
+
22
10
  # @private
23
- module Watobo#:nodoc: all
24
- module Plugin
25
- module Crawler
26
- module Constants
27
- CRAWL_NONE = 0x00
28
- CRAWL_RUNNING = 0x01
29
- CRAWL_PAUSED = 0x02
30
-
31
- end
32
- end
33
- end
11
+ module Watobo#:nodoc: all
12
+ module Plugin
13
+ module Crawler
14
+ module Constants
15
+ CRAWL_NONE = 0x00
16
+ CRAWL_RUNNING = 0x01
17
+ CRAWL_PAUSED = 0x02
18
+
19
+ end
20
+ end
21
+ end
34
22
  end
@@ -1,520 +1,517 @@
1
- # .
1
+ #.
2
2
  # engine.rb
3
- #
4
- # Copyright 2013 by siberas, http://www.siberas.de
5
- #
6
- # This file is part of WATOBO (Web Application Tool Box)
7
- # http://watobo.sourceforge.com
8
- #
9
- # WATOBO is free software; you can redistribute it and/or modify
10
- # it under the terms of the GNU General Public License as published by
11
- # the Free Software Foundation version 2 of the License.
12
- #
13
- # WATOBO is distributed in the hope that it will be useful,
14
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
- # GNU General Public License for more details.
17
- #
18
- # You should have received a copy of the GNU General Public License
19
- # along with WATOBO; if not, write to the Free Software
20
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21
- # .
22
- # @private
23
- module Watobo#:nodoc: all
24
- module Crawler
25
-
26
- class Agent < Mechanize
27
-
28
- def initialize(opts)
29
- super()
30
-
31
-
32
- self.verify_mode = OpenSSL::SSL::VERIFY_NONE
33
- self.ignore_bad_chunking = true
34
- self.keep_alive = false
35
-
36
- self.user_agent = opts[:user_agent] if opts.has_key?(:user_agent)
37
-
38
- if opts.has_key? :username and opts.has_key? :password
39
- unless opts[:username].empty? and opts[:password].empty?
40
-
41
- user = opts[:username]
42
- pw = opts[:password]
43
- uri = opts[:auth_uri]
44
- # puts "Got Credentials for #{uri}: #{user} / #{pw}"
45
- self.add_auth(uri, user , pw )
46
- # TODO: remove this workaround for a Mechanize Bug (#243)
47
- p = self.get uri
48
- end
49
- end
50
-
51
- if ( opts.has_key? :proxy_host ) && ( opts.has_key? :proxy_port )
52
- self.set_proxy( opts[:proxy_host], opts[:proxy_port] )
53
- end
54
-
55
- if opts.has_key? :pre_connect_hook
56
- self.pre_connect_hooks << opts[:pre_connect_hook] if opts[:pre_connect_hook].respond_to? :call
57
- end
58
-
59
- unless opts[:cookie_jar].nil?
60
- clean_jar = Mechanize::CookieJar.new
61
- opts[:cookie_jar].each{ |cookie|
62
- clean_jar.add! cookie
63
- }
64
- self.cookie_jar = clean_jar
65
- end
66
-
67
- end
68
-
69
- end
70
-
71
- class Engine
72
- include Watobo::Plugin::Crawler::Constants
73
-
74
- def subscribe(event, &callback)
75
- (@event_dispatcher_listeners[event] ||= []) << callback
76
- end
77
-
78
- def clearEvents(event)
79
- @event_dispatcher_listeners[event] ||= []
80
- @event_dispatcher_listeners[event].clear
81
- end
82
-
83
- def notify(event, *args)
84
- if @event_dispatcher_listeners[event]
85
- # puts "NOTIFY: #{self}(:#{event}) [#{@event_dispatcher_listeners[event].length}]" if $DEBUG
86
- @event_dispatcher_listeners[event].each do |m|
87
- m.call(*args) if m.respond_to? :call
88
- end
89
- end
90
- end
91
-
92
- def settings
93
- @opts
94
- end
95
-
96
-
97
-
98
- def get_page(url, opts={})
99
- ro = {}.update @opts
100
- ro.update opts
101
- agent = Crawler::Agent.new(ro)
102
- page = nil
103
- page = agent.get url
104
- return agent, page
105
- end
106
-
107
- def initialize(opts={})
108
- @event_dispatcher_listeners = Hash.new
109
- @status_lock = Mutex.new
110
-
111
- @opts = {
112
- :submit_forms => true,
113
- :max_depth => 5,
114
- :max_repeat => 20,
115
- :max_threads => 4,
116
- :user_agent => "watobo-crawler",
117
- :proxy_host => '127.0.0.1',
118
- :proxy_port => Watobo::Conf::Interceptor.port,
119
- :delay => 0,
120
- :head_request_pattern => '(pdf|swf|doc|flv|jpg|png|gif)',
121
- :allowed_hosts => [], # regex's
122
- :allowed_urls => [], # regex's
123
- :excluded_urls => ["logout"], # regex's
124
- :excluded_fields => ["userid","username","password"], # regex's'
125
- :excluded_form_names => [], # regex's'
126
- :root_path => "", # regex
127
- :username => "",
128
- :password => "",
129
- :auth_uri => nil,
130
- :auth_domain => "", # for ntlm auth
131
- :cookie_jar => nil
132
- }
133
-
134
- @opts.update opts
135
- @opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?
136
-
137
- @stats = {
138
- :total_requests => 0
139
- }
140
-
141
- @link_keys = Hash.new
142
- @link_counts = Hash.new
143
-
144
- @form_keys = Hash.new
145
- @form_counts = Hash.new
146
-
147
- end
148
-
149
- def pause
150
- false
151
- end
152
-
153
- def cancel
154
- puts "[CRAWLER] - CANCEL!!"
155
- @status_lock.synchronize do
156
- @engine_status = CRAWL_NONE
157
- end
158
- @grabber_threads.each do |gt|
159
- puts "Killing Thread #{gt}"
160
- gt.kill
161
- gt.raise "CANCEL"
162
- end
163
- @grabber_threads.each{|t| t.join }
164
-
165
- @link_queue.clear
166
- @page_queue.clear
167
- @grabber_threads.clear
168
- @link_keys.clear
169
- @link_counts.clear
170
-
171
- @form_keys.clear
172
- @form_counts.clear
173
-
174
- notify( :update_status, current_status )
175
- puts "CANCELED - CANCELED"
176
- # exit
177
- end
3
+ #.
4
+ # Copyright 2014 by siberas, http://www.siberas.de
5
+ # This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
6
+ # WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
7
+ # WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
8
+ # You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
178
9
 
10
+ # @private
11
+ module Watobo#:nodoc: all
12
+ module Crawler
13
+
14
+ class Agent < Mechanize
15
+
16
+ def initialize(opts)
17
+ super()
18
+
19
+
20
+ self.verify_mode = OpenSSL::SSL::VERIFY_NONE
21
+ self.ignore_bad_chunking = true
22
+ self.keep_alive = false
23
+
24
+ self.user_agent = opts[:user_agent] if opts.has_key?(:user_agent)
25
+
26
+ if opts.has_key? :username and opts.has_key? :password
27
+ unless opts[:username].empty? and opts[:password].empty?
28
+
29
+ user = opts[:username]
30
+ pw = opts[:password]
31
+ uri = opts[:auth_uri]
32
+ # puts "Got Credentials for #{uri}: #{user} / #{pw}"
33
+ self.add_auth(uri, user , pw )
34
+ # TODO: remove this workaround for a Mechanize Bug (#243)
35
+ p = self.get uri
36
+ end
37
+ end
38
+
39
+ if ( opts.has_key? :proxy_host ) && ( opts.has_key? :proxy_port )
40
+ self.set_proxy( opts[:proxy_host], opts[:proxy_port] )
41
+ end
42
+
43
+ if opts.has_key? :pre_connect_hook
44
+ self.pre_connect_hooks << opts[:pre_connect_hook] if opts[:pre_connect_hook].respond_to? :call
45
+ end
46
+
47
+ unless opts[:cookie_jar].nil?
48
+ clean_jar = Mechanize::CookieJar.new
49
+ opts[:cookie_jar].each{ |cookie|
50
+ clean_jar.add! cookie
51
+ }
52
+ self.cookie_jar = clean_jar
53
+ end
54
+
55
+ end
56
+
57
+ end
58
+
59
+ class Engine
60
+ include Watobo::Plugin::Crawler::Constants
61
+
62
+ def subscribe(event, &callback)
63
+ (@event_dispatcher_listeners[event] ||= []) << callback
64
+ end
65
+
66
+ def clearEvents(event)
67
+ @event_dispatcher_listeners[event] ||= []
68
+ @event_dispatcher_listeners[event].clear
69
+ end
70
+
71
+ def notify(event, *args)
72
+ if @event_dispatcher_listeners[event]
73
+ # puts "NOTIFY: #{self}(:#{event}) [#{@event_dispatcher_listeners[event].length}]" if $DEBUG
74
+ @event_dispatcher_listeners[event].each do |m|
75
+ m.call(*args) if m.respond_to? :call
76
+ end
77
+ end
78
+ end
79
+
80
+ def settings
81
+ @opts
82
+ end
83
+
84
+
85
+
86
+ def get_page(url, opts={})
87
+ ro = {}.update @opts
88
+ ro.update opts
89
+ agent = Crawler::Agent.new(ro)
90
+ page = nil
91
+ page = agent.get url
92
+ return agent, page
93
+ end
94
+
95
+ def initialize(opts={})
96
+ @event_dispatcher_listeners = Hash.new
97
+ @status_lock = Mutex.new
98
+
99
+ @opts = {
100
+ :submit_forms => true,
101
+ :max_depth => 5,
102
+ :max_repeat => 20,
103
+ :max_threads => 4,
104
+ :user_agent => "watobo-crawler",
105
+ :proxy_host => '127.0.0.1',
106
+ :proxy_port => Watobo::Conf::Interceptor.port,
107
+ :delay => 0,
108
+ :head_request_pattern => '(pdf|swf|doc|flv|jpg|png|gif)',
109
+ :allowed_hosts => [], # regex's
110
+ :allowed_urls => [], # regex's
111
+ :excluded_urls => ["logout"], # regex's
112
+ :excluded_fields => ["userid","username","password"], # regex's'
113
+ :excluded_form_names => [], # regex's'
114
+ :root_path => "", # regex
115
+ :username => "",
116
+ :password => "",
117
+ :auth_uri => nil,
118
+ :auth_domain => "", # for ntlm auth
119
+ :cookie_jar => nil
120
+ }
121
+
122
+ @opts.update opts
123
+ @opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?
124
+
125
+ @stats = {
126
+ :total_requests => 0
127
+ }
128
+
129
+ @link_keys = Hash.new
130
+ @link_counts = Hash.new
131
+
132
+ @form_keys = Hash.new
133
+ @form_counts = Hash.new
134
+
135
+ end
136
+
137
+ def pause
138
+ false
139
+ end
140
+
141
+ def cancel
142
+ puts "[CRAWLER] - CANCEL!!"
143
+ #@status_lock.synchronize do
144
+ # @engine_status = CRAWL_NONE
145
+ #end
146
+ Watobo::Crawler::Status.engine = CRAWL_NONE
147
+ @grabber_threads.each do |gt|
148
+ puts "Killing Thread #{gt}"
149
+ gt.kill
150
+ gt.raise "CANCEL"
151
+ end
152
+ @grabber_threads.each{|t| t.join }
153
+
154
+ @link_queue.clear
155
+ @page_queue.clear
156
+ @grabber_threads.clear
157
+ @link_keys.clear
158
+ @link_counts.clear
159
+
160
+ @form_keys.clear
161
+ @form_counts.clear
162
+
163
+ #notify( :update_status, current_status )
164
+ puts "CANCELED - CANCELED"
165
+ # exit
166
+ end
167
+
179
168
  def run(url, opts={})
180
- @engine_status = CRAWL_RUNNING
181
-
182
- @opts.update opts
169
+ #engine_status = CRAWL_RUNNING
170
+ Watobo::Crawler::Status.reset
171
+ Watobo::Crawler::Status.engine = CRAWL_RUNNING
172
+
173
+ @opts.update opts
183
174
  @opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?
184
175
 
185
176
  puts "crawler settings:"
186
177
  puts @opts.to_json
187
-
188
-
189
- @link_queue = Queue.new
190
- @page_queue = Queue.new
191
- @link_keys = Hash.new
192
- @link_counts = Hash.new
193
-
194
- @form_keys = Hash.new
195
- @form_counts = Hash.new
178
+
179
+
180
+ @link_queue = Queue.new
181
+ @page_queue = Queue.new
196
182
 
197
- @skipped_sites = Hash.new
198
-
199
- @grabber_threads = []
200
- start_link = URI.parse url
201
- return false if start_link.host.nil?
202
-
203
- allow_host(start_link)
204
-
205
- @link_queue.enq LinkBag.new(start_link, 0)
206
-
207
-
208
- notify(:log, "Crawling #{url} started ..." )
209
-
210
- @opts[:max_threads].times do |i|
211
- g = Grabber.new(@link_queue, @page_queue, @opts )
212
- @grabber_threads << g.run
213
- end
214
-
215
- puts "* startet #{@grabber_threads.length} grabbers"
216
-
217
- loop do
218
- pagebag = @page_queue.deq
219
-
220
- process_links(pagebag)
221
-
222
- process_forms(pagebag)
223
- @stats[:total_requests] += 1 unless pagebag.nil?
224
-
225
- puts "Links/Pages: #{@link_queue.size}/#{@page_queue.size}"
226
- notify( :update_status, current_status )
227
- # if @link_queue.empty? and @page_queue.empty?
228
- if @page_queue.empty?
229
- # if page_queue is empty wait for all grabber threads finishing the link_queue
230
- until @link_queue.num_waiting == @grabber_threads.length
231
- Thread.pass
232
- end
233
- # when the link_queue is finished check the page_queue. Crawling is finished if page_queue is empty too.
234
- if @page_queue.empty?
235
- @grabber_threads.each { |t| t.kill }
236
- puts "Finished Crawling"
237
- @status_lock.synchronize{ @engine_status = CRAWL_NONE }
238
- notify(:log, "Crawling finished")
239
- notify( :update_status, current_status )
240
- break
241
-
242
- end
243
- end
244
-
245
- end
246
-
247
- end
248
-
249
- private
250
-
251
- def current_status
183
+ @link_keys = Hash.new
184
+ @link_counts = Hash.new
185
+
186
+ @form_keys = Hash.new
187
+ @form_counts = Hash.new
188
+
189
+ @skipped_sites = Hash.new
190
+
191
+ @grabber_threads = []
192
+ start_link = URI.parse url
193
+ return false if start_link.host.nil?
194
+
195
+ allow_host(start_link)
196
+
197
+ @link_queue.enq LinkBag.new(start_link, 0)
198
+
199
+
200
+ notify(:log, "Crawling #{url} started ..." )
201
+
202
+ @opts[:max_threads].times do |i|
203
+ g = Grabber.new(@link_queue, @page_queue, @opts )
204
+ @grabber_threads << g.run
205
+ end
206
+
207
+ puts "* startet #{@grabber_threads.length} grabbers"
208
+
209
+ loop do
210
+ pagebag = @page_queue.deq
211
+
212
+ process_links(pagebag)
213
+
214
+ process_forms(pagebag)
215
+ #@stats[:total_requests] += 1 unless pagebag.nil?
216
+ Watobo::Crawler::Status.inc_requests() unless pagebag.nil?
217
+ Watobo::Crawler::Status.page_size= @page_queue.size
218
+ Watobo::Crawler::Status.link_size= @link_queue.size
219
+
220
+ puts "Links/Pages: #{@link_queue.size}/#{@page_queue.size}"
221
+ #notify( :update_status, current_status )
222
+ # if @link_queue.empty? and @page_queue.empty?
223
+ if @page_queue.empty?
224
+ # if page_queue is empty wait for all grabber threads finishing the link_queue
225
+ until @link_queue.num_waiting == @grabber_threads.length
226
+ Thread.pass
227
+ end
228
+ # when the link_queue is finished check the page_queue. Crawling is finished if page_queue is empty too.
229
+ if @page_queue.empty?
230
+ @grabber_threads.each { |t| t.kill }
231
+ puts "Finished Crawling"
232
+ #@status_lock.synchronize{ @engine_status = CRAWL_NONE }
233
+ Watobo::Crawler::Status.engine = CRAWL_NONE
234
+
235
+ notify(:log, "Crawling finished")
236
+ #notify( :update_status, current_status )
237
+ break
238
+
239
+ end
240
+ end
241
+
242
+ end
243
+
244
+ end
245
+
246
+ private
247
+
248
+ def current_status
252
249
  {
253
- :engine_status => @engine_status,
254
- :link_size => @link_queue.size,
255
- :page_size => @page_queue.size
256
- }.update @stats
257
-
258
- end
259
-
260
-
261
- def allow_host(uri)
262
- if uri.is_a? URI
263
- site = uri.site.to_s
264
- # puts "Valid Site: #{site}"
265
- ah = allowed_hosts
266
- ah << site
267
- end
268
- end
269
-
270
- def process_forms(pagebag)
271
- return false unless pagebag.respond_to? :page
272
- page=pagebag.page
273
- return false unless page.respond_to? :forms
274
- page.forms.each do |f|
275
-
276
- action = page.uri.merge f.action unless f.action =~ /^http/
277
- f.action = action.to_s
278
-
279
- if send_form? f
280
- # puts "SUBMIT FORM: #{f.action}"
281
- send_form(f, pagebag.depth)
282
- end
283
- end
284
- end
285
-
286
- def process_links(pagebag)
287
- return false unless pagebag.respond_to? :page
288
- page = pagebag.page
289
- return false unless page.respond_to? :links
290
-
291
- page.links.each do |l|
292
- begin
250
+ :engine_status => @engine_status,
251
+ :link_size => @link_queue.size,
252
+ :page_size => @page_queue.size
253
+ }.update @stats
254
+
255
+ end
256
+
257
+
258
+ def allow_host(uri)
259
+ if uri.is_a? URI
260
+ site = uri.site.to_s
261
+ # puts "Valid Site: #{site}"
262
+ ah = allowed_hosts
263
+ ah << site
264
+ end
265
+ end
266
+
267
+ def process_forms(pagebag)
268
+ return false unless pagebag.respond_to? :page
269
+ page=pagebag.page
270
+ return false unless page.respond_to? :forms
271
+ page.forms.each do |f|
272
+
273
+ action = page.uri.merge f.action unless f.action =~ /^http/
274
+ f.action = action.to_s
275
+
276
+ if send_form? f
277
+ # puts "SUBMIT FORM: #{f.action}"
278
+ send_form(f, pagebag.depth)
279
+ end
280
+ end
281
+ end
282
+
283
+ def process_links(pagebag)
284
+ return false unless pagebag.respond_to? :page
285
+ page = pagebag.page
286
+ return false unless page.respond_to? :links
287
+
288
+ page.links.each do |l|
289
+ begin
293
290
  link = l
294
291
  next if l.href.nil?
295
-
296
- link = page.uri.merge l.uri unless l.href =~ /^http/
297
- # puts "FOLLOW LINK #{link} ?"
298
- if follow_link? link
299
- # puts ">> OK"
300
- submit_link(link, pagebag.depth)
301
- else
302
- # puts ">> NO"
303
- end
304
- rescue => bang
292
+
293
+ link = page.uri.merge l.uri unless l.href =~ /^http/
294
+ # puts "FOLLOW LINK #{link} ?"
295
+ if follow_link? link
296
+ # puts ">> OK"
297
+ submit_link(link, pagebag.depth)
298
+ else
299
+ # puts ">> NO"
300
+ end
301
+ rescue => bang
305
302
  puts bang
306
- puts bang.backtrace if $DEBUG
307
- end
308
- end
309
-
310
- end
311
-
312
-
313
- def submit_link(link, depth)
314
- # @link_keys[link_key(link)] = link
315
-
316
- clk = link_key(link, :clear_values => true)
317
- @link_counts[clk] ||= 0
318
- @link_counts[clk] += 1
319
- lk = link_key(link)
320
- return false if @link_keys.has_key? lk
321
- @link_keys[lk] = nil
322
- if @link_counts[clk] < @opts[:max_repeat]
323
- @link_queue.enq LinkBag.new(link, depth)
324
- else
325
- puts "! MAX REPEAT !\nSkipped link #{link}" if $DEBUG
326
- end
327
- end
328
-
329
- def form_key(form, opts={} )
330
- o = { :clear_values => false }
331
- o.update opts
332
-
333
- fp = "#{form.action}"
334
- fp << form.method
335
- if form.request_data =~ /=/
336
- data = form.request_data.split("&").sort.join("&")
337
- if o[:clear_values]
338
- fp << data.gsub(/=[^&]*/,'=')
339
- else
340
- fp << data
341
- end
342
- end
343
- fkey = Digest::MD5.hexdigest fp
344
- fkey
345
- end
346
-
347
- def send_form(form, depth)
348
- return false if @engine_status == CRAWL_NONE
349
- cfk = form_key(form, :clear_values => true)
350
- @form_counts[cfk] ||= 0
351
- @form_counts[cfk] += 1
352
-
353
- # @form_keys[form_key(form)] = form
354
- fk = form_key(form)
355
- return false if @form_keys.has_key? fk
356
- @form_keys[fk] = nil
357
- begin
358
- if @form_counts[cfk] < @opts[:max_repeat]
359
- if form.buttons.length > 0
360
- p = form.click_button
361
- else
362
- p = form.submit()
363
- end
364
- puts p.class
365
- @page_queue.enq PageBag.new(p, depth+1)
366
- else
367
- puts "! MAX REPEAT !\nSkipped Form #{form.action}"
368
- end
369
- rescue => bang
370
- puts bang
371
- puts bang.backtrace
372
- end
373
- end
374
-
375
- def send_form?(form)
376
- # puts "SEND FORM?"
377
- return false unless engine_running?
378
- return false unless @opts[:submit_forms] == true
379
- # puts "> submit_forms"
380
- return false unless allowed? form.action
381
- #puts "> allowed"
382
- return false unless fields_allowed? form
383
- #puts "> fields allowed"
384
- return false if form_sent? form
385
- # puts "> form not sent"
386
- return true
387
- end
388
-
389
- def follow_link?(link)
390
- return false unless allowed? link
391
- return false if link_is_followed? link
392
- return true
393
- end
394
-
395
- def host_allowed?(uri)
396
- #puts "ALLOWED HOSTS =>"
397
- #puts allowed_hosts
398
- #puts "---"
399
- # puts "Host Allowed?"
400
- ah = allowed_hosts
401
- # puts ah.class
402
- #puts ah
403
- return false if ah.empty?
404
- ahc = ah.select{ |h| uri.site =~ /^#{h}$/ }.length
405
- if ahc > 0
406
- # puts "> Host IS allowed!"
407
- return true
408
- end
409
- # puts "> Host is NOT allowed!"
410
- return false
411
- end
412
-
413
- def url_allowed?(uri)
414
- # puts "* excluded_urls"
415
- # puts exluded_urls
416
- return false if excluded_urls.select{ |url| uri.path =~ /#{url}/ }.length > 0
417
- # puts "* allowed_urls"
418
- # puts allowed_urls
419
- return true if allowed_urls.empty?
420
- return true if allowed_urls.select{ |url| uri.path =~ /#{url}/ }.length > 0
421
- # puts "> URL is NOT allowed"
422
- return false
423
- end
424
-
425
- def path_allowed?(uri)
426
- return true if root_path.nil?
427
- return true if root_path.empty?
428
- return true if uri.path =~ /^#{root_path}/
429
- # puts "> PATH is NOT ALLOWED"
430
- return false
431
- end
432
-
433
- def cleanup_uri(obj)
434
- uri = nil
435
- uri = obj.uri if obj.respond_to? :uri
436
- uri = URI.parse(obj) if obj.is_a? String
437
- uri = obj if obj.is_a? URI::HTTP
438
- uri
439
- end
440
-
441
- def allowed?(link)
442
- valid = false
443
- # need to handle different link objects, Mechanize::Page::Link and URIs
444
- uri = nil
445
- uri = link.uri if link.respond_to? :uri
446
- uri = URI.parse(link) if link.is_a? String
447
- uri = link if link.is_a? URI::HTTP
448
-
449
- return false if uri.nil?
450
-
451
- host_allowed?(uri) &&
452
- url_allowed?(uri) &&
453
- path_allowed?(uri)
454
- end
455
-
456
- def form_sent?(form)
457
-
458
- @form_keys.has_key? form_key(form)
459
- end
460
-
461
- def link_key(link, opts={})
462
- o = { :clear_values => false }
463
- o.update opts
464
-
465
- uri = cleanup_uri(link)
466
-
467
- query_sorted = ""
468
- query_sorted = uri.query.split("&").sort.join("&") unless uri.query.nil?
469
-
470
- key = ""
471
- key << uri.scheme
472
- key << uri.site
473
- key << uri.path
474
- key << query_sorted
475
- key.gsub!(/=[^&]*/,'=') if o[:clear_values] == true
476
-
477
- Digest::MD5.hexdigest key
478
- end
479
-
480
- def engine_running?
481
- @status_lock.synchronize do
482
- return false if @engine_status == CRAWL_NONE
483
- return true
484
- end
485
- end
486
-
487
- def link_is_followed?(link)
488
-
489
- return true if @link_keys.has_key? link_key(link)
490
-
491
- false
492
- end
493
-
494
- def fields_allowed?(form)
495
- form.fields.each do |f|
496
- excluded_fields.each do |ef|
497
- return false if f.name =~ /#{ef}/
498
- end
499
- end
500
- return true
501
- end
502
-
503
- def method_missing(name, *args, &block)
504
- # puts "* instance method missing (#{name})"
505
- if name =~ /(.*)=$/
506
- @opts.has_key? $1.to_sym || super
507
- @opts[$1.to_sym] = args[0]
508
- return @opts[$1.to_sym]
509
- else
510
- k = name.to_sym
511
- @opts.has_key? k || super
512
- # puts "Value Found For #{k.to_yaml}"
513
- return @opts[k]
514
-
515
- end
516
- end
517
- end
518
- end
519
-
520
- end
303
+ puts bang.backtrace if $DEBUG
304
+ end
305
+ end
306
+
307
+ end
308
+
309
+
310
+ def submit_link(link, depth)
311
+ # @link_keys[link_key(link)] = link
312
+
313
+ clk = link_key(link, :clear_values => true)
314
+ @link_counts[clk] ||= 0
315
+ @link_counts[clk] += 1
316
+ lk = link_key(link)
317
+ return false if @link_keys.has_key? lk
318
+ @link_keys[lk] = nil
319
+ if @link_counts[clk] < @opts[:max_repeat]
320
+ @link_queue.enq LinkBag.new(link, depth)
321
+ else
322
+ puts "! MAX REPEAT !\nSkipped link #{link}" if $DEBUG
323
+ end
324
+ end
325
+
326
+ def form_key(form, opts={} )
327
+ o = { :clear_values => false }
328
+ o.update opts
329
+
330
+ fp = "#{form.action}"
331
+ fp << form.method
332
+ if form.request_data =~ /=/
333
+ data = form.request_data.split("&").sort.join("&")
334
+ if o[:clear_values]
335
+ fp << data.gsub(/=[^&]*/,'=')
336
+ else
337
+ fp << data
338
+ end
339
+ end
340
+ fkey = Digest::MD5.hexdigest fp
341
+ fkey
342
+ end
343
+
344
+ def send_form(form, depth)
345
+ return false if @engine_status == CRAWL_NONE
346
+ cfk = form_key(form, :clear_values => true)
347
+ @form_counts[cfk] ||= 0
348
+ @form_counts[cfk] += 1
349
+
350
+ # @form_keys[form_key(form)] = form
351
+ fk = form_key(form)
352
+ return false if @form_keys.has_key? fk
353
+ @form_keys[fk] = nil
354
+ begin
355
+ if @form_counts[cfk] < @opts[:max_repeat]
356
+ if form.buttons.length > 0
357
+ p = form.click_button
358
+ else
359
+ p = form.submit()
360
+ end
361
+ puts p.class
362
+ @page_queue.enq PageBag.new(p, depth+1)
363
+ else
364
+ puts "! MAX REPEAT !\nSkipped Form #{form.action}"
365
+ end
366
+ rescue => bang
367
+ puts bang
368
+ puts bang.backtrace
369
+ end
370
+ end
371
+
372
+ def send_form?(form)
373
+ # puts "SEND FORM?"
374
+ return false unless engine_running?
375
+ return false unless @opts[:submit_forms] == true
376
+ # puts "> submit_forms"
377
+ return false unless allowed? form.action
378
+ #puts "> allowed"
379
+ return false unless fields_allowed? form
380
+ #puts "> fields allowed"
381
+ return false if form_sent? form
382
+ # puts "> form not sent"
383
+ return true
384
+ end
385
+
386
+ def follow_link?(link)
387
+ return false unless allowed? link
388
+ return false if link_is_followed? link
389
+ return true
390
+ end
391
+
392
+ def host_allowed?(uri)
393
+ #puts "ALLOWED HOSTS =>"
394
+ #puts allowed_hosts
395
+ #puts "---"
396
+ # puts "Host Allowed?"
397
+ ah = allowed_hosts
398
+ # puts ah.class
399
+ #puts ah
400
+ return false if ah.empty?
401
+ ahc = ah.select{ |h| uri.site =~ /^#{h}$/ }.length
402
+ if ahc > 0
403
+ # puts "> Host IS allowed!"
404
+ return true
405
+ end
406
+ # puts "> Host is NOT allowed!"
407
+ return false
408
+ end
409
+
410
+ def url_allowed?(uri)
411
+ # puts "* excluded_urls"
412
+ # puts exluded_urls
413
+ return false if excluded_urls.select{ |url| uri.path =~ /#{url}/ }.length > 0
414
+ # puts "* allowed_urls"
415
+ # puts allowed_urls
416
+ return true if allowed_urls.empty?
417
+ return true if allowed_urls.select{ |url| uri.path =~ /#{url}/ }.length > 0
418
+ # puts "> URL is NOT allowed"
419
+ return false
420
+ end
421
+
422
+ def path_allowed?(uri)
423
+ return true if root_path.nil?
424
+ return true if root_path.empty?
425
+ return true if uri.path =~ /^#{root_path}/
426
+ # puts "> PATH is NOT ALLOWED"
427
+ return false
428
+ end
429
+
430
+ def cleanup_uri(obj)
431
+ uri = nil
432
+ uri = obj.uri if obj.respond_to? :uri
433
+ uri = URI.parse(obj) if obj.is_a? String
434
+ uri = obj if obj.is_a? URI::HTTP
435
+ uri
436
+ end
437
+
438
+ def allowed?(link)
439
+ valid = false
440
+ # need to handle different link objects, Mechanize::Page::Link and URIs
441
+ uri = nil
442
+ uri = link.uri if link.respond_to? :uri
443
+ uri = URI.parse(link) if link.is_a? String
444
+ uri = link if link.is_a? URI::HTTP
445
+
446
+ return false if uri.nil?
447
+
448
+ host_allowed?(uri) &&
449
+ url_allowed?(uri) &&
450
+ path_allowed?(uri)
451
+ end
452
+
453
+ def form_sent?(form)
454
+
455
+ @form_keys.has_key? form_key(form)
456
+ end
457
+
458
+ def link_key(link, opts={})
459
+ o = { :clear_values => false }
460
+ o.update opts
461
+
462
+ uri = cleanup_uri(link)
463
+
464
+ query_sorted = ""
465
+ query_sorted = uri.query.split("&").sort.join("&") unless uri.query.nil?
466
+
467
+ key = ""
468
+ key << uri.scheme
469
+ key << uri.site
470
+ key << uri.path
471
+ key << query_sorted
472
+ key.gsub!(/=[^&]*/,'=') if o[:clear_values] == true
473
+
474
+ Digest::MD5.hexdigest key
475
+ end
476
+
477
+ def engine_running?
478
+ @status_lock.synchronize do
479
+ return false if @engine_status == CRAWL_NONE
480
+ return true
481
+ end
482
+ end
483
+
484
+ def link_is_followed?(link)
485
+
486
+ return true if @link_keys.has_key? link_key(link)
487
+
488
+ false
489
+ end
490
+
491
+ def fields_allowed?(form)
492
+ form.fields.each do |f|
493
+ excluded_fields.each do |ef|
494
+ return false if f.name =~ /#{ef}/
495
+ end
496
+ end
497
+ return true
498
+ end
499
+
500
+ def method_missing(name, *args, &block)
501
+ # puts "* instance method missing (#{name})"
502
+ if name =~ /(.*)=$/
503
+ @opts.has_key? $1.to_sym || super
504
+ @opts[$1.to_sym] = args[0]
505
+ return @opts[$1.to_sym]
506
+ else
507
+ k = name.to_sym
508
+ @opts.has_key? k || super
509
+ # puts "Value Found For #{k.to_yaml}"
510
+ return @opts[k]
511
+
512
+ end
513
+ end
514
+ end
515
+ end
516
+
517
+ end