watobo 0.9.21 → 0.9.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +46 -1
- data/bin/nfq_server.rb +0 -9
- data/bin/watobo_gui.rb +3 -13
- data/custom-views/prettify-json.rb +9 -18
- data/icons/watobo.ico +0 -0
- data/icons/watobo.ico.old +0 -0
- data/lib/watobo.rb +10 -19
- data/lib/watobo/adapters.rb +5 -14
- data/lib/watobo/adapters/data_store.rb +50 -59
- data/lib/watobo/adapters/file/file_store.rb +287 -296
- data/lib/watobo/adapters/file/marshal_store.rb +293 -296
- data/lib/watobo/adapters/session_store.rb +5 -14
- data/lib/watobo/ca.rb +1 -10
- data/lib/watobo/config.rb +197 -206
- data/lib/watobo/constants.rb +0 -9
- data/lib/watobo/core.rb +3 -12
- data/lib/watobo/core/active_check.rb +72 -135
- data/lib/watobo/core/active_checks.rb +49 -58
- data/lib/watobo/core/ca.rb +369 -389
- data/lib/watobo/core/cert_store.rb +34 -43
- data/lib/watobo/core/chat.rb +92 -101
- data/lib/watobo/core/chats.rb +271 -280
- data/lib/watobo/core/client_cert_store.rb +106 -35
- data/lib/watobo/core/conversation.rb +48 -57
- data/lib/watobo/core/cookie.rb +23 -32
- data/lib/watobo/core/egress_handlers.rb +98 -0
- data/lib/watobo/core/finding.rb +66 -75
- data/lib/watobo/core/findings.rb +107 -114
- data/lib/watobo/core/forwarding_proxy.rb +13 -22
- data/lib/watobo/core/fuzz_gen.rb +0 -9
- data/lib/watobo/core/intercept_carver.rb +166 -177
- data/lib/watobo/core/intercept_filter.rb +235 -244
- data/lib/watobo/core/interceptor.rb +98 -107
- data/lib/watobo/core/min_class.rb +4 -13
- data/lib/watobo/core/netfilter_queue.rb +170 -179
- data/lib/watobo/core/ott_cache.rb +132 -141
- data/lib/watobo/core/parameter.rb +43 -52
- data/lib/watobo/core/passive_check.rb +103 -102
- data/lib/watobo/core/passive_checks.rb +48 -57
- data/lib/watobo/core/passive_scanner.rb +54 -55
- data/lib/watobo/core/plugin.rb +11 -20
- data/lib/watobo/core/project.rb +3 -9
- data/lib/watobo/core/proxy.rb +43 -52
- data/lib/watobo/core/request.rb +125 -123
- data/lib/watobo/core/response.rb +44 -53
- data/lib/watobo/core/scanner.rb +0 -9
- data/lib/watobo/core/scanner3.rb +405 -414
- data/lib/watobo/core/scope.rb +83 -92
- data/lib/watobo/core/session.rb +1043 -1026
- data/lib/watobo/core/sid_cache.rb +98 -107
- data/lib/watobo/core/subscriber.rb +25 -34
- data/lib/watobo/defaults.rb +21 -30
- data/lib/watobo/external/diff/lcs.rb +0 -9
- data/lib/watobo/external/diff/lcs/array.rb +0 -9
- data/lib/watobo/external/diff/lcs/block.rb +0 -9
- data/lib/watobo/external/diff/lcs/callbacks.rb +0 -9
- data/lib/watobo/external/diff/lcs/change.rb +0 -9
- data/lib/watobo/external/diff/lcs/hunk.rb +0 -9
- data/lib/watobo/external/diff/lcs/ldiff.rb +0 -9
- data/lib/watobo/external/diff/lcs/string.rb +0 -9
- data/lib/watobo/externals.rb +6 -15
- data/lib/watobo/framework.rb +4 -13
- data/lib/watobo/framework/create_project.rb +60 -69
- data/lib/watobo/framework/init.rb +0 -9
- data/lib/watobo/framework/init_modules.rb +0 -9
- data/lib/watobo/framework/license_text.rb +28 -37
- data/lib/watobo/framework/load_chat.rb +13 -22
- data/lib/watobo/gui.rb +132 -123
- data/lib/watobo/gui/about_watobo.rb +0 -9
- data/lib/watobo/gui/browser_preview.rb +0 -9
- data/lib/watobo/gui/certificate_dialog.rb +0 -9
- data/lib/watobo/gui/chat_diff.rb +0 -9
- data/lib/watobo/gui/chatviewer_frame.rb +73 -72
- data/lib/watobo/gui/checkboxtree.rb +0 -9
- data/lib/watobo/gui/checks_policy_frame.rb +0 -9
- data/lib/watobo/gui/client_cert_dialog.rb +96 -87
- data/lib/watobo/gui/confirm_scan_dialog.rb +0 -9
- data/lib/watobo/gui/conversation_table.rb +158 -164
- data/lib/watobo/gui/conversation_table_ctrl.rb +207 -216
- data/lib/watobo/gui/conversation_table_ctrl2.rb +373 -382
- data/lib/watobo/gui/csrf_token_dialog.rb +0 -9
- data/lib/watobo/gui/custom_viewer.rb +374 -383
- data/lib/watobo/gui/dashboard.rb +296 -303
- data/lib/watobo/gui/define_scope_frame.rb +0 -9
- data/lib/watobo/gui/differ_frame.rb +215 -224
- data/lib/watobo/gui/edit_comment.rb +0 -9
- data/lib/watobo/gui/edit_scope_dialog.rb +0 -9
- data/lib/watobo/gui/export_dialog.rb +104 -113
- data/lib/watobo/gui/finding_info.rb +0 -9
- data/lib/watobo/gui/findings_tree.rb +210 -217
- data/lib/watobo/gui/full_scan_dialog.rb +0 -9
- data/lib/watobo/gui/fuzzer_gui.rb +1295 -1313
- data/lib/watobo/gui/fxsave_thread.rb +14 -0
- data/lib/watobo/gui/goto_url_dialog.rb +70 -79
- data/lib/watobo/gui/hex_viewer.rb +0 -9
- data/lib/watobo/gui/html_viewer.rb +287 -296
- data/lib/watobo/gui/intercept_filter_dialog.rb +188 -197
- data/lib/watobo/gui/interceptor_gui.rb +1041 -1051
- data/lib/watobo/gui/interceptor_settings_dialog.rb +0 -9
- data/lib/watobo/gui/json_viewer.rb +287 -0
- data/lib/watobo/gui/list_box.rb +101 -110
- data/lib/watobo/gui/log_file_viewer.rb +32 -41
- data/lib/watobo/gui/log_viewer.rb +83 -88
- data/lib/watobo/gui/login_wizzard.rb +0 -9
- data/lib/watobo/gui/main_window.rb +587 -618
- data/lib/watobo/gui/manual_request_editor.rb +620 -565
- data/lib/watobo/gui/master_pw_dialog.rb +0 -9
- data/lib/watobo/gui/mixins/gui_settings.rb +29 -38
- data/lib/watobo/gui/page_tree.rb +217 -226
- data/lib/watobo/gui/password_policy_dialog.rb +0 -9
- data/lib/watobo/gui/plugin_board.rb +0 -9
- data/lib/watobo/gui/preferences_dialog.rb +0 -9
- data/lib/watobo/gui/progress_window.rb +17 -27
- data/lib/watobo/gui/project_wizzard.rb +0 -9
- data/lib/watobo/gui/proxy_dialog.rb +1 -10
- data/lib/watobo/gui/quick_scan_dialog.rb +0 -9
- data/lib/watobo/gui/request_builder_frame.rb +102 -111
- data/lib/watobo/gui/request_editor.rb +181 -137
- data/lib/watobo/gui/rewrite_filters_dialog.rb +394 -403
- data/lib/watobo/gui/rewrite_rules_dialog.rb +372 -381
- data/lib/watobo/gui/save_chat_dialog.rb +140 -149
- data/lib/watobo/gui/scanner_settings_dialog.rb +0 -9
- data/lib/watobo/gui/select_chat_dialog.rb +0 -9
- data/lib/watobo/gui/session_management_dialog.rb +0 -9
- data/lib/watobo/gui/sites_tree.rb +0 -9
- data/lib/watobo/gui/status_bar.rb +0 -9
- data/lib/watobo/gui/table_editor.rb +0 -9
- data/lib/watobo/gui/tagless_viewer.rb +0 -9
- data/lib/watobo/gui/templates/plugin.rb +0 -9
- data/lib/watobo/gui/templates/plugin2.rb +92 -100
- data/lib/watobo/gui/templates/plugin_base.rb +144 -153
- data/lib/watobo/gui/text_viewer.rb +0 -9
- data/lib/watobo/gui/transcoder_window.rb +0 -9
- data/lib/watobo/gui/utils/gui_utils.rb +0 -9
- data/lib/watobo/gui/utils/init_icons.rb +86 -95
- data/lib/watobo/gui/utils/load_icons.rb +33 -42
- data/lib/watobo/gui/utils/load_plugins.rb +116 -119
- data/lib/watobo/gui/utils/master_password.rb +68 -77
- data/lib/watobo/gui/utils/save_default_settings.rb +113 -122
- data/lib/watobo/gui/utils/save_project_settings.rb +0 -9
- data/lib/watobo/gui/utils/save_proxy_settings.rb +41 -50
- data/lib/watobo/gui/utils/save_scanner_settings.rb +18 -27
- data/lib/watobo/gui/utils/session_history.rb +112 -121
- data/lib/watobo/gui/workspace_dialog.rb +0 -9
- data/lib/watobo/gui/www_auth_dialog.rb +0 -9
- data/lib/watobo/gui/xml_viewer_frame.rb +0 -9
- data/lib/watobo/http.rb +4 -13
- data/lib/watobo/http/cookies/cookies.rb +26 -35
- data/lib/watobo/http/data/data.rb +45 -54
- data/lib/watobo/http/data/json.rb +47 -55
- data/lib/watobo/http/url/url.rb +38 -47
- data/lib/watobo/http/xml/xml.rb +124 -130
- data/lib/watobo/interceptor.rb +3 -12
- data/lib/watobo/interceptor/proxy.rb +742 -739
- data/lib/watobo/interceptor/transparent.rb +22 -24
- data/lib/watobo/mixins.rb +10 -19
- data/lib/watobo/mixins/check_info.rb +27 -36
- data/lib/watobo/mixins/httpparser.rb +613 -637
- data/lib/watobo/mixins/request_parser.rb +88 -97
- data/lib/watobo/mixins/shapers.rb +515 -529
- data/lib/watobo/mixins/transcoders.rb +3 -11
- data/lib/watobo/parser.rb +1 -10
- data/lib/watobo/parser/html.rb +83 -92
- data/lib/watobo/patch_fxruby_setfocus.rb +26 -0
- data/lib/watobo/sockets.rb +3 -12
- data/lib/watobo/sockets/agent.rb +828 -837
- data/lib/watobo/sockets/client_socket.rb +308 -312
- data/lib/watobo/sockets/connection.rb +401 -410
- data/lib/watobo/sockets/http_socket.rb +11 -13
- data/lib/watobo/sockets/ntlm_auth.rb +129 -138
- data/lib/watobo/utils.rb +10 -19
- data/lib/watobo/utils/check_regex.rb +0 -9
- data/lib/watobo/utils/copy_object.rb +0 -9
- data/lib/watobo/utils/crypto.rb +0 -9
- data/lib/watobo/utils/expand_range.rb +23 -32
- data/lib/watobo/utils/export_xml.rb +97 -106
- data/lib/watobo/utils/file_management.rb +9 -11
- data/lib/watobo/utils/hexprint.rb +9 -18
- data/lib/watobo/utils/load_chat.rb +0 -9
- data/lib/watobo/utils/load_icon.rb +0 -9
- data/lib/watobo/utils/ntlm.rb +866 -875
- data/lib/watobo/utils/print_debug.rb +12 -21
- data/lib/watobo/utils/response_builder.rb +90 -99
- data/lib/watobo/utils/response_hash.rb +0 -9
- data/lib/watobo/utils/secure_eval.rb +0 -9
- data/lib/watobo/utils/strings.rb +10 -19
- data/lib/watobo/utils/text2request.rb +0 -9
- data/lib/watobo/utils/url.rb +23 -32
- data/lib/watobo/utils/utf16.rb +11 -20
- data/modules/active/Apache/mod_status.rb +0 -9
- data/modules/active/Apache/multiview.rb +151 -160
- data/modules/active/Flash/crossdomain.rb +0 -9
- data/modules/active/JWT/jwt_oauth2_none.rb +111 -0
- data/modules/active/cq5/cq5_default_selectors.rb +106 -115
- data/modules/active/cq5/cqp_user_enumeration.rb +125 -134
- data/modules/active/directories/dirwalker.rb +0 -9
- data/modules/active/discovery/fileextensions.rb +0 -9
- data/modules/active/discovery/http_methods.rb +0 -9
- data/modules/active/discovery/jsmapfiles.rb +79 -0
- data/modules/active/domino/domino_db.rb +68 -76
- data/modules/active/dotNET/custom_errors.rb +102 -111
- data/modules/active/dotNET/dotnet_files.rb +90 -99
- data/modules/active/fileinclusion/lfi_simple.rb +0 -9
- data/modules/active/jboss/jboss_basic.rb +0 -9
- data/modules/active/sap/business_objects.rb +51 -60
- data/modules/active/sap/its_commands.rb +0 -9
- data/modules/active/sap/its_service_parameter.rb +0 -9
- data/modules/active/sap/its_services.rb +0 -9
- data/modules/active/sap/its_xss.rb +0 -9
- data/modules/active/shell_shock/shell_shock.rb +139 -148
- data/modules/active/siebel/siebel_apps.rb +160 -169
- data/modules/active/sqlinjection/sql_boolean.rb +0 -9
- data/modules/active/sqlinjection/sql_numerical.rb +198 -0
- data/modules/active/sqlinjection/sqli_error.rb +0 -9
- data/modules/active/sqlinjection/sqli_timing.rb +220 -229
- data/modules/active/struts2/default_handler_ognl.rb +106 -115
- data/modules/active/struts2/include_params_ognl.rb +105 -114
- data/modules/active/xml/xml_xxe.rb +112 -123
- data/modules/active/xss/xss_ng.rb +214 -223
- data/modules/active/xss/xss_simple.rb +0 -9
- data/modules/passive/ajax.rb +68 -77
- data/modules/passive/autocomplete.rb +56 -65
- data/modules/passive/cookie_options.rb +0 -9
- data/modules/passive/cookie_xss.rb +0 -9
- data/modules/passive/detect_code.rb +0 -9
- data/modules/passive/detect_fileupload.rb +0 -9
- data/modules/passive/detect_infrastructure.rb +0 -9
- data/modules/passive/detect_one_time_tokens.rb +0 -9
- data/modules/passive/dirindexing.rb +0 -9
- data/modules/passive/disclosure_domino.rb +55 -64
- data/modules/passive/disclosure_emails.rb +0 -9
- data/modules/passive/disclosure_ipaddr.rb +55 -53
- data/modules/passive/filename_as_parameter.rb +0 -9
- data/modules/passive/form_spotter.rb +0 -9
- data/modules/passive/hidden_fields.rb +50 -59
- data/modules/passive/hotspots.rb +0 -9
- data/modules/passive/in_script_parameter.rb +0 -9
- data/modules/passive/json_web_token.rb +93 -0
- data/modules/passive/multiple_server_headers.rb +0 -9
- data/modules/passive/possible_login.rb +0 -9
- data/modules/passive/redirect_url.rb +0 -9
- data/modules/passive/redirectionz.rb +0 -9
- data/modules/passive/sap-headers.rb +56 -65
- data/modules/passive/xss_dom.rb +0 -9
- data/plugins/aem/aem.rb +11 -20
- data/plugins/aem/gui/main.rb +118 -127
- data/plugins/aem/gui/tree_view.rb +171 -180
- data/plugins/aem/lib/agent.rb +130 -138
- data/plugins/aem/lib/dispatcher.rb +45 -51
- data/plugins/aem/lib/engine.rb +177 -186
- data/plugins/catalog/catalog.rb +345 -355
- data/plugins/crawler/crawler.rb +4 -13
- data/plugins/crawler/gui.rb +5 -14
- data/plugins/crawler/gui/auth_frame.rb +270 -279
- data/plugins/crawler/gui/crawler_gui.rb +271 -276
- data/plugins/crawler/gui/general_settings_frame.rb +96 -105
- data/plugins/crawler/gui/hooks_frame.rb +80 -89
- data/plugins/crawler/gui/scope_frame.rb +50 -59
- data/plugins/crawler/gui/settings_tabbook.rb +38 -47
- data/plugins/crawler/gui/status_frame.rb +59 -68
- data/plugins/crawler/lib/bags.rb +18 -27
- data/plugins/crawler/lib/constants.rb +11 -20
- data/plugins/crawler/lib/engine.rb +488 -497
- data/plugins/crawler/lib/grabber.rb +68 -77
- data/plugins/crawler/lib/status.rb +71 -80
- data/plugins/crawler/lib/uri_mp.rb +12 -21
- data/plugins/filefinder/filefinder.rb +326 -333
- data/plugins/sqlmap/bin/test.rb +78 -87
- data/plugins/sqlmap/gui.rb +4 -13
- data/plugins/sqlmap/gui/main.rb +218 -227
- data/plugins/sqlmap/gui/options_frame.rb +97 -106
- data/plugins/sqlmap/lib/sqlmap_ctrl.rb +90 -100
- data/plugins/sqlmap/sqlmap.rb +2 -11
- data/plugins/sslchecker/cli/sslchecker_cli.rb +0 -9
- data/plugins/sslchecker/gui/cipher_table.rb +246 -254
- data/plugins/sslchecker/gui/gui.rb +258 -264
- data/plugins/sslchecker/gui/sslchecker.rb +4 -13
- data/plugins/sslchecker/lib/check.rb +127 -133
- data/plugins/wshell/gui/main.rb +119 -117
- data/plugins/wshell/lib/core.rb +38 -88
- data/plugins/wshell/wshell.rb +11 -20
- metadata +170 -164
|
@@ -1,49 +1,40 @@
|
|
|
1
|
-
#.
|
|
2
|
-
# settings_tabbook.rb
|
|
3
|
-
#.
|
|
4
|
-
# Copyright 2014 by siberas, http://www.siberas.de
|
|
5
|
-
# This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
|
|
6
|
-
# WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
|
|
7
|
-
# WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
8
|
-
# You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
9
|
-
|
|
10
1
|
# @private
|
|
11
|
-
module Watobo#:nodoc: all
|
|
12
|
-
module Plugin
|
|
13
|
-
module Crawler
|
|
14
|
-
class Gui
|
|
15
|
-
class SettingsTabBook < FXTabBook
|
|
16
|
-
attr :hooks, :general, :log_viewer, :auth, :scope
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
def initialize(owner)
|
|
21
|
-
#@tab = FXTabBook.new(self, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
|
|
22
|
-
super(owner, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
|
|
23
|
-
FXTabItem.new(self, "General", nil)
|
|
24
|
-
# frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_RAISED)
|
|
25
|
-
@general = GeneralSettingsFrame.new(self)
|
|
26
|
-
|
|
27
|
-
FXTabItem.new(self, "Scope", nil)
|
|
28
|
-
@scope = ScopeFrame.new(self)
|
|
29
|
-
|
|
30
|
-
FXTabItem.new(self, "Auth", nil)
|
|
31
|
-
@auth = AuthFrame.new(self)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
FXTabItem.new(self, "Hooks", nil)
|
|
35
|
-
@hooks = HooksFrame.new(self)
|
|
36
|
-
|
|
37
|
-
FXTabItem.new(self, "Log", nil)
|
|
38
|
-
frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_THICK|FRAME_RAISED)
|
|
39
|
-
@log_viewer = Watobo::Gui::LogViewer.new(frame, :append, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_SUNKEN)
|
|
40
|
-
|
|
41
|
-
self.connect(SEL_COMMAND){
|
|
42
|
-
@hooks.selected if self.current == 3
|
|
43
|
-
}
|
|
44
|
-
end
|
|
45
|
-
end
|
|
46
|
-
end
|
|
47
|
-
end
|
|
48
|
-
end
|
|
2
|
+
module Watobo#:nodoc: all
|
|
3
|
+
module Plugin
|
|
4
|
+
module Crawler
|
|
5
|
+
class Gui
|
|
6
|
+
class SettingsTabBook < FXTabBook
|
|
7
|
+
attr :hooks, :general, :log_viewer, :auth, :scope
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def initialize(owner)
|
|
12
|
+
#@tab = FXTabBook.new(self, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
|
|
13
|
+
super(owner, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
|
|
14
|
+
FXTabItem.new(self, "General", nil)
|
|
15
|
+
# frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_RAISED)
|
|
16
|
+
@general = GeneralSettingsFrame.new(self)
|
|
17
|
+
|
|
18
|
+
FXTabItem.new(self, "Scope", nil)
|
|
19
|
+
@scope = ScopeFrame.new(self)
|
|
20
|
+
|
|
21
|
+
FXTabItem.new(self, "Auth", nil)
|
|
22
|
+
@auth = AuthFrame.new(self)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
FXTabItem.new(self, "Hooks", nil)
|
|
26
|
+
@hooks = HooksFrame.new(self)
|
|
27
|
+
|
|
28
|
+
FXTabItem.new(self, "Log", nil)
|
|
29
|
+
frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_THICK|FRAME_RAISED)
|
|
30
|
+
@log_viewer = Watobo::Gui::LogViewer.new(frame, :append, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_SUNKEN)
|
|
31
|
+
|
|
32
|
+
self.connect(SEL_COMMAND){
|
|
33
|
+
@hooks.selected if self.current == 3
|
|
34
|
+
}
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
49
40
|
end
|
|
@@ -1,71 +1,62 @@
|
|
|
1
|
-
#.
|
|
2
|
-
# status_frame.rb
|
|
3
|
-
#.
|
|
4
|
-
# Copyright 2014 by siberas, http://www.siberas.de
|
|
5
|
-
# This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
|
|
6
|
-
# WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
|
|
7
|
-
# WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
8
|
-
# You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
9
|
-
|
|
10
1
|
# @private
|
|
11
|
-
module Watobo#:nodoc: all
|
|
12
|
-
module Plugin
|
|
13
|
-
module Crawler
|
|
14
|
-
class Gui
|
|
15
|
-
class StatusFrame < FXHorizontalFrame
|
|
16
|
-
|
|
17
|
-
include Watobo::Plugin::Crawler::Constants
|
|
18
|
-
# :engine_status => CRAWL_NONE,
|
|
19
|
-
# :page_size => 0,
|
|
20
|
-
# :link_size => 0,
|
|
21
|
-
# :skipped_domains => 0
|
|
2
|
+
module Watobo#:nodoc: all
|
|
3
|
+
module Plugin
|
|
4
|
+
module Crawler
|
|
5
|
+
class Gui
|
|
6
|
+
class StatusFrame < FXHorizontalFrame
|
|
7
|
+
|
|
8
|
+
include Watobo::Plugin::Crawler::Constants
|
|
9
|
+
# :engine_status => CRAWL_NONE,
|
|
10
|
+
# :page_size => 0,
|
|
11
|
+
# :link_size => 0,
|
|
12
|
+
# :skipped_domains => 0
|
|
22
13
|
def update_status(status)
|
|
23
|
-
#puts status.to_yaml
|
|
24
|
-
if status.has_key? :engine_status
|
|
25
|
-
case status[:engine_status]
|
|
26
|
-
when CRAWL_NONE
|
|
27
|
-
self.backColor = self.parent.backColor
|
|
28
|
-
@status_txt.text = "Status: Idle"
|
|
29
|
-
when CRAWL_RUNNING
|
|
30
|
-
self.backColor = FXColor::Red
|
|
31
|
-
@status_txt.text = "Status: Running"
|
|
32
|
-
|
|
33
|
-
when CRAWL_PAUSED
|
|
34
|
-
self.backColor = FXColor::Yellow
|
|
35
|
-
@status_txt.text = "Status: Paused"
|
|
36
|
-
end
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
if status.has_key? :link_size
|
|
40
|
-
@link_size_txt.text = "Links: #{status[:link_size]}"
|
|
41
|
-
end
|
|
42
|
-
|
|
43
|
-
if status.has_key? :page_size
|
|
44
|
-
@page_size_txt.text = "Pages: #{status[:page_size]}"
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
if status.has_key? :total_requests
|
|
48
|
-
@requests_txt.text = "Requests: #{status[:total_requests]}"
|
|
49
|
-
end
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
def initialize(owner)
|
|
53
|
-
super(owner, :opts => LAYOUT_FILL_X|FRAME_RAISED)
|
|
54
|
-
@info_fields = []
|
|
55
|
-
#frame = FXHorizontalFrame.new(, :opts => LAYOUT_FILL_Y, :padding => 0)
|
|
56
|
-
frame = self
|
|
57
|
-
@info_fields << ( @status_txt = FXLabel.new(frame, "Status: Stopped", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
|
|
58
|
-
@info_fields << (@link_size_txt = FXLabel.new(frame, "Links: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
|
|
59
|
-
@info_fields << (@page_size_txt = FXLabel.new(frame, "Pages: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
|
|
60
|
-
@info_fields << (@requests_txt = FXLabel.new(frame, "Requests: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
|
|
61
|
-
|
|
62
|
-
@info_fields.each do |i|
|
|
63
|
-
i.justify = JUSTIFY_LEFT
|
|
64
|
-
end
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
end
|
|
68
|
-
end
|
|
69
|
-
end
|
|
70
|
-
end
|
|
14
|
+
#puts status.to_yaml
|
|
15
|
+
if status.has_key? :engine_status
|
|
16
|
+
case status[:engine_status]
|
|
17
|
+
when CRAWL_NONE
|
|
18
|
+
self.backColor = self.parent.backColor
|
|
19
|
+
@status_txt.text = "Status: Idle"
|
|
20
|
+
when CRAWL_RUNNING
|
|
21
|
+
self.backColor = FXColor::Red
|
|
22
|
+
@status_txt.text = "Status: Running"
|
|
23
|
+
|
|
24
|
+
when CRAWL_PAUSED
|
|
25
|
+
self.backColor = FXColor::Yellow
|
|
26
|
+
@status_txt.text = "Status: Paused"
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
if status.has_key? :link_size
|
|
31
|
+
@link_size_txt.text = "Links: #{status[:link_size]}"
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
if status.has_key? :page_size
|
|
35
|
+
@page_size_txt.text = "Pages: #{status[:page_size]}"
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
if status.has_key? :total_requests
|
|
39
|
+
@requests_txt.text = "Requests: #{status[:total_requests]}"
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def initialize(owner)
|
|
44
|
+
super(owner, :opts => LAYOUT_FILL_X|FRAME_RAISED)
|
|
45
|
+
@info_fields = []
|
|
46
|
+
#frame = FXHorizontalFrame.new(, :opts => LAYOUT_FILL_Y, :padding => 0)
|
|
47
|
+
frame = self
|
|
48
|
+
@info_fields << ( @status_txt = FXLabel.new(frame, "Status: Stopped", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
|
|
49
|
+
@info_fields << (@link_size_txt = FXLabel.new(frame, "Links: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
|
|
50
|
+
@info_fields << (@page_size_txt = FXLabel.new(frame, "Pages: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
|
|
51
|
+
@info_fields << (@requests_txt = FXLabel.new(frame, "Requests: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
|
|
52
|
+
|
|
53
|
+
@info_fields.each do |i|
|
|
54
|
+
i.justify = JUSTIFY_LEFT
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
end
|
|
71
62
|
end
|
data/plugins/crawler/lib/bags.rb
CHANGED
|
@@ -1,29 +1,20 @@
|
|
|
1
|
-
#.
|
|
2
|
-
# bags.rb
|
|
3
|
-
#.
|
|
4
|
-
# Copyright 2014 by siberas, http://www.siberas.de
|
|
5
|
-
# This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
|
|
6
|
-
# WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
|
|
7
|
-
# WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
8
|
-
# You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
9
|
-
|
|
10
1
|
# @private
|
|
11
|
-
module Watobo#:nodoc: all
|
|
12
|
-
module Crawler
|
|
13
|
-
class PageBag
|
|
14
|
-
attr :page, :depth
|
|
15
|
-
def initialize(page, depth)
|
|
16
|
-
@page = page
|
|
17
|
-
@depth = depth
|
|
18
|
-
end
|
|
19
|
-
end
|
|
20
|
-
|
|
21
|
-
class LinkBag
|
|
22
|
-
attr :link, :depth
|
|
23
|
-
def initialize(link, depth)
|
|
24
|
-
@link = link
|
|
25
|
-
@depth = depth
|
|
26
|
-
end
|
|
27
|
-
end
|
|
28
|
-
end
|
|
2
|
+
module Watobo#:nodoc: all
|
|
3
|
+
module Crawler
|
|
4
|
+
class PageBag
|
|
5
|
+
attr :page, :depth
|
|
6
|
+
def initialize(page, depth)
|
|
7
|
+
@page = page
|
|
8
|
+
@depth = depth
|
|
9
|
+
end
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
class LinkBag
|
|
13
|
+
attr :link, :depth
|
|
14
|
+
def initialize(link, depth)
|
|
15
|
+
@link = link
|
|
16
|
+
@depth = depth
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
29
20
|
end
|
|
@@ -1,22 +1,13 @@
|
|
|
1
|
-
#.
|
|
2
|
-
# constants.rb
|
|
3
|
-
#.
|
|
4
|
-
# Copyright 2014 by siberas, http://www.siberas.de
|
|
5
|
-
# This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
|
|
6
|
-
# WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
|
|
7
|
-
# WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
8
|
-
# You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
9
|
-
|
|
10
1
|
# @private
|
|
11
|
-
module Watobo#:nodoc: all
|
|
12
|
-
module Plugin
|
|
13
|
-
module Crawler
|
|
14
|
-
module Constants
|
|
15
|
-
CRAWL_NONE = 0x00
|
|
16
|
-
CRAWL_RUNNING = 0x01
|
|
17
|
-
CRAWL_PAUSED = 0x02
|
|
18
|
-
|
|
19
|
-
end
|
|
20
|
-
end
|
|
21
|
-
end
|
|
2
|
+
module Watobo#:nodoc: all
|
|
3
|
+
module Plugin
|
|
4
|
+
module Crawler
|
|
5
|
+
module Constants
|
|
6
|
+
CRAWL_NONE = 0x00
|
|
7
|
+
CRAWL_RUNNING = 0x01
|
|
8
|
+
CRAWL_PAUSED = 0x02
|
|
9
|
+
|
|
10
|
+
end
|
|
11
|
+
end
|
|
12
|
+
end
|
|
22
13
|
end
|
|
@@ -1,517 +1,508 @@
|
|
|
1
|
-
#.
|
|
2
|
-
# engine.rb
|
|
3
|
-
#.
|
|
4
|
-
# Copyright 2014 by siberas, http://www.siberas.de
|
|
5
|
-
# This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
|
|
6
|
-
# WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
|
|
7
|
-
# WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
8
|
-
# You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
9
|
-
|
|
10
1
|
# @private
|
|
11
|
-
module Watobo#:nodoc: all
|
|
12
|
-
module Crawler
|
|
13
|
-
|
|
14
|
-
class Agent < Mechanize
|
|
15
|
-
|
|
16
|
-
def initialize(opts)
|
|
17
|
-
super()
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
self.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
|
21
|
-
self.ignore_bad_chunking = true
|
|
22
|
-
self.keep_alive = false
|
|
23
|
-
|
|
24
|
-
self.user_agent = opts[:user_agent] if opts.has_key?(:user_agent)
|
|
25
|
-
|
|
26
|
-
if opts.has_key? :username and opts.has_key? :password
|
|
27
|
-
unless opts[:username].empty? and opts[:password].empty?
|
|
28
|
-
|
|
29
|
-
user = opts[:username]
|
|
30
|
-
pw = opts[:password]
|
|
31
|
-
uri = opts[:auth_uri]
|
|
32
|
-
# puts "Got Credentials for #{uri}: #{user} / #{pw}"
|
|
33
|
-
self.add_auth(uri, user , pw )
|
|
34
|
-
# TODO: remove this workaround for a Mechanize Bug (#243)
|
|
35
|
-
p = self.get uri
|
|
36
|
-
end
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
if ( opts.has_key? :proxy_host ) && ( opts.has_key? :proxy_port )
|
|
40
|
-
self.set_proxy( opts[:proxy_host], opts[:proxy_port] )
|
|
41
|
-
end
|
|
42
|
-
|
|
43
|
-
if opts.has_key? :pre_connect_hook
|
|
44
|
-
self.pre_connect_hooks << opts[:pre_connect_hook] if opts[:pre_connect_hook].respond_to? :call
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
unless opts[:cookie_jar].nil?
|
|
48
|
-
clean_jar = Mechanize::CookieJar.new
|
|
49
|
-
opts[:cookie_jar].each{ |cookie|
|
|
50
|
-
clean_jar.add! cookie
|
|
51
|
-
}
|
|
52
|
-
self.cookie_jar = clean_jar
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
end
|
|
56
|
-
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
class Engine
|
|
60
|
-
include Watobo::Plugin::Crawler::Constants
|
|
61
|
-
|
|
62
|
-
def subscribe(event, &callback)
|
|
63
|
-
(@event_dispatcher_listeners[event] ||= []) << callback
|
|
64
|
-
end
|
|
65
|
-
|
|
66
|
-
def clearEvents(event)
|
|
67
|
-
@event_dispatcher_listeners[event] ||= []
|
|
68
|
-
@event_dispatcher_listeners[event].clear
|
|
69
|
-
end
|
|
70
|
-
|
|
71
|
-
def notify(event, *args)
|
|
72
|
-
if @event_dispatcher_listeners[event]
|
|
73
|
-
# puts "NOTIFY: #{self}(:#{event}) [#{@event_dispatcher_listeners[event].length}]" if $DEBUG
|
|
74
|
-
@event_dispatcher_listeners[event].each do |m|
|
|
75
|
-
m.call(*args) if m.respond_to? :call
|
|
76
|
-
end
|
|
77
|
-
end
|
|
78
|
-
end
|
|
79
|
-
|
|
80
|
-
def settings
|
|
81
|
-
@opts
|
|
82
|
-
end
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
def get_page(url, opts={})
|
|
87
|
-
ro = {}.update @opts
|
|
88
|
-
ro.update opts
|
|
89
|
-
agent = Crawler::Agent.new(ro)
|
|
90
|
-
page = nil
|
|
91
|
-
page = agent.get url
|
|
92
|
-
return agent, page
|
|
93
|
-
end
|
|
94
|
-
|
|
95
|
-
def initialize(opts={})
|
|
96
|
-
@event_dispatcher_listeners = Hash.new
|
|
97
|
-
@status_lock = Mutex.new
|
|
98
|
-
|
|
99
|
-
@opts = {
|
|
100
|
-
:submit_forms => true,
|
|
101
|
-
:max_depth => 5,
|
|
102
|
-
:max_repeat => 20,
|
|
103
|
-
:max_threads => 4,
|
|
104
|
-
:user_agent => "watobo-crawler",
|
|
105
|
-
:proxy_host => '127.0.0.1',
|
|
106
|
-
:proxy_port => Watobo::Conf::Interceptor.port,
|
|
107
|
-
:delay => 0,
|
|
108
|
-
:head_request_pattern => '(pdf|swf|doc|flv|jpg|png|gif)',
|
|
109
|
-
:allowed_hosts => [], # regex's
|
|
110
|
-
:allowed_urls => [], # regex's
|
|
111
|
-
:excluded_urls => ["logout"], # regex's
|
|
112
|
-
:excluded_fields => ["userid","username","password"], # regex's'
|
|
113
|
-
:excluded_form_names => [], # regex's'
|
|
114
|
-
:root_path => "", # regex
|
|
115
|
-
:username => "",
|
|
116
|
-
:password => "",
|
|
117
|
-
:auth_uri => nil,
|
|
118
|
-
:auth_domain => "", # for ntlm auth
|
|
119
|
-
:cookie_jar => nil
|
|
120
|
-
}
|
|
121
|
-
|
|
122
|
-
@opts.update opts
|
|
123
|
-
@opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?
|
|
124
|
-
|
|
125
|
-
@stats = {
|
|
126
|
-
:total_requests => 0
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
@link_keys = Hash.new
|
|
130
|
-
@link_counts = Hash.new
|
|
131
|
-
|
|
132
|
-
@form_keys = Hash.new
|
|
133
|
-
@form_counts = Hash.new
|
|
134
|
-
|
|
135
|
-
end
|
|
136
|
-
|
|
137
|
-
def pause
|
|
138
|
-
false
|
|
139
|
-
end
|
|
140
|
-
|
|
141
|
-
def cancel
|
|
142
|
-
puts "[CRAWLER] - CANCEL!!"
|
|
143
|
-
#@status_lock.synchronize do
|
|
144
|
-
# @engine_status = CRAWL_NONE
|
|
2
|
+
module Watobo#:nodoc: all
|
|
3
|
+
module Crawler
|
|
4
|
+
|
|
5
|
+
# Mechanize agent preconfigured for the crawler.
#
# Certificate verification is switched off, bad chunking is tolerated and
# keep-alive is disabled; everything else is driven by the +opts+ hash.
class Agent < Mechanize

  # @param opts [Hash] crawler settings. Keys read here: :user_agent,
  #   :username, :password, :auth_uri, :proxy_host, :proxy_port,
  #   :pre_connect_hook and :cookie_jar.
  def initialize(opts)
    super()

    self.verify_mode = OpenSSL::SSL::VERIFY_NONE
    self.ignore_bad_chunking = true
    self.keep_alive = false

    self.user_agent = opts[:user_agent] if opts.has_key?(:user_agent)

    if opts.has_key?(:username) && opts.has_key?(:password)
      # to_s guards against nil credentials, which used to raise on .empty?
      user = opts[:username].to_s
      pw = opts[:password].to_s
      uri = opts[:auth_uri]
      # Credentials are only usable together with the URI they belong to.
      # The engine's default :auth_uri is nil, which must not be handed to
      # add_auth/get.
      unless uri.nil? || (user.empty? && pw.empty?)
        # puts "Got Credentials for #{uri}: #{user} / #{pw}"
        self.add_auth(uri, user, pw)
        # TODO: remove this workaround for a Mechanize Bug (#243)
        # NOTE(review): this request is sent before the proxy is configured
        # below - confirm that bypassing the proxy here is intended.
        self.get uri
      end
    end

    if opts.has_key?(:proxy_host) && opts.has_key?(:proxy_port)
      self.set_proxy(opts[:proxy_host], opts[:proxy_port])
    end

    if opts.has_key? :pre_connect_hook
      # only callables are accepted as hooks
      self.pre_connect_hooks << opts[:pre_connect_hook] if opts[:pre_connect_hook].respond_to? :call
    end

    unless opts[:cookie_jar].nil?
      # copy the cookies into a fresh jar instead of sharing the given one
      clean_jar = Mechanize::CookieJar.new
      opts[:cookie_jar].each { |cookie| clean_jar.add! cookie }
      self.cookie_jar = clean_jar
    end
  end

end
|
|
49
|
+
|
|
50
|
+
class Engine
|
|
51
|
+
include Watobo::Plugin::Crawler::Constants
|
|
52
|
+
|
|
53
|
+
# Registers the given block as a listener for +event+.
#
# @param event [Object] key the listener is stored under
# @return [Array] all listeners currently registered for +event+
def subscribe(event, &callback)
  listeners = (@event_dispatcher_listeners[event] ||= [])
  listeners << callback
end
|
|
56
|
+
|
|
57
|
+
# Removes every listener registered for +event+.
#
# An empty listener list is created first when none exists yet.
#
# @return [Array] the (now empty) listener list
def clearEvents(event)
  listeners = @event_dispatcher_listeners[event]
  listeners = @event_dispatcher_listeners[event] = [] unless listeners
  listeners.clear
end
|
|
61
|
+
|
|
62
|
+
# Calls every listener registered for +event+ with +args+.
#
# Entries that do not respond to #call are skipped. Nothing happens when
# no listener list exists for +event+.
def notify(event, *args)
  listeners = @event_dispatcher_listeners[event]
  return unless listeners

  # puts "NOTIFY: #{self}(:#{event}) [#{listeners.length}]" if $DEBUG
  listeners.each do |listener|
    next unless listener.respond_to? :call
    listener.call(*args)
  end
end
|
|
70
|
+
|
|
71
|
+
# Exposes the engine's option hash.
#
# @return [Hash] the live @opts hash (not a copy)
def settings
  return @opts
end
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# Fetches a single page with a fresh agent.
#
# The engine options are copied and overlaid with +opts+ for this request
# only; @opts itself is not modified.
#
# @param url [String, URI] address handed to the agent's #get
# @param opts [Hash] per-request overrides of the engine options
# @return [Array] two elements: the agent used and the page it returned
def get_page(url, opts={})
  request_opts = @opts.merge(opts)
  agent = Crawler::Agent.new(request_opts)
  [agent, agent.get(url)]
end
|
|
85
|
+
|
|
86
|
+
# Sets up the crawler engine with default options overlaid by +opts+.
#
# Queues and the grabber thread list are created here as well (#run
# replaces them with fresh ones), so that #cancel can be called safely
# before the first #run.
#
# @param opts [Hash] overrides for the defaults listed below
def initialize(opts={})
  @event_dispatcher_listeners = Hash.new
  @status_lock = Mutex.new

  @opts = {
    :submit_forms => true,
    :max_depth => 5,
    :max_repeat => 20,
    :max_threads => 4,
    :user_agent => "watobo-crawler",
    :proxy_host => '127.0.0.1',
    :proxy_port => Watobo::Conf::Interceptor.port,
    :delay => 0,
    :head_request_pattern => '(pdf|swf|doc|flv|jpg|png|gif)',
    :allowed_hosts => [], # regex's
    :allowed_urls => [], # regex's
    :excluded_urls => ["logout"], # regex's
    :excluded_fields => ["userid","username","password"], # regex's'
    :excluded_form_names => [], # regex's'
    :root_path => "", # regex
    :username => "",
    :password => "",
    :auth_uri => nil,
    :auth_domain => "", # for ntlm auth
    :cookie_jar => nil
  }

  @opts.update opts
  # an explicit nil pattern is normalised to the empty string
  @opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?

  @stats = {
    :total_requests => 0
  }

  @link_keys = Hash.new
  @link_counts = Hash.new

  @form_keys = Hash.new
  @form_counts = Hash.new

  # placeholders until #run creates the real ones; #cancel iterates and
  # clears these unconditionally
  @grabber_threads = []
  @link_queue = Queue.new
  @page_queue = Queue.new

end
|
|
127
|
+
|
|
128
|
+
# Placeholder: the method body does nothing but report false.
#
# @return [false]
def pause
  return false
end
|
|
131
|
+
|
|
132
|
+
def cancel
|
|
133
|
+
puts "[CRAWLER] - CANCEL!!"
|
|
134
|
+
#@status_lock.synchronize do
|
|
135
|
+
# @engine_status = CRAWL_NONE
|
|
145
136
|
#end
|
|
146
|
-
Watobo::Crawler::Status.engine = CRAWL_NONE
|
|
147
|
-
@grabber_threads.each do |gt|
|
|
148
|
-
puts "Killing Thread #{gt}"
|
|
149
|
-
gt.kill
|
|
150
|
-
gt.raise "CANCEL"
|
|
151
|
-
end
|
|
152
|
-
@grabber_threads.each{|t| t.join }
|
|
153
|
-
|
|
154
|
-
@link_queue.clear
|
|
155
|
-
@page_queue.clear
|
|
156
|
-
@grabber_threads.clear
|
|
157
|
-
@link_keys.clear
|
|
158
|
-
@link_counts.clear
|
|
159
|
-
|
|
160
|
-
@form_keys.clear
|
|
161
|
-
@form_counts.clear
|
|
162
|
-
|
|
163
|
-
#notify( :update_status, current_status )
|
|
164
|
-
puts "CANCELED - CANCELED"
|
|
165
|
-
# exit
|
|
166
|
-
end
|
|
167
|
-
|
|
137
|
+
Watobo::Crawler::Status.engine = CRAWL_NONE
|
|
138
|
+
@grabber_threads.each do |gt|
|
|
139
|
+
puts "Killing Thread #{gt}"
|
|
140
|
+
gt.kill
|
|
141
|
+
gt.raise "CANCEL"
|
|
142
|
+
end
|
|
143
|
+
@grabber_threads.each{|t| t.join }
|
|
144
|
+
|
|
145
|
+
@link_queue.clear
|
|
146
|
+
@page_queue.clear
|
|
147
|
+
@grabber_threads.clear
|
|
148
|
+
@link_keys.clear
|
|
149
|
+
@link_counts.clear
|
|
150
|
+
|
|
151
|
+
@form_keys.clear
|
|
152
|
+
@form_counts.clear
|
|
153
|
+
|
|
154
|
+
#notify( :update_status, current_status )
|
|
155
|
+
puts "CANCELED - CANCELED"
|
|
156
|
+
# exit
|
|
157
|
+
end
|
|
158
|
+
|
|
168
159
|
def run(url, opts={})
|
|
169
160
|
#engine_status = CRAWL_RUNNING
|
|
170
161
|
Watobo::Crawler::Status.reset
|
|
171
162
|
Watobo::Crawler::Status.engine = CRAWL_RUNNING
|
|
172
|
-
|
|
173
|
-
@opts.update opts
|
|
163
|
+
|
|
164
|
+
@opts.update opts
|
|
174
165
|
@opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?
|
|
175
166
|
|
|
176
167
|
puts "crawler settings:"
|
|
177
168
|
puts @opts.to_json
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
@link_queue = Queue.new
|
|
181
|
-
@page_queue = Queue.new
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
@link_queue = Queue.new
|
|
172
|
+
@page_queue = Queue.new
|
|
173
|
+
|
|
174
|
+
@link_keys = Hash.new
|
|
175
|
+
@link_counts = Hash.new
|
|
182
176
|
|
|
183
|
-
@
|
|
184
|
-
@
|
|
185
|
-
|
|
186
|
-
@
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
start_link
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
process_links(pagebag)
|
|
213
|
-
|
|
214
|
-
process_forms(pagebag)
|
|
177
|
+
@form_keys = Hash.new
|
|
178
|
+
@form_counts = Hash.new
|
|
179
|
+
|
|
180
|
+
@skipped_sites = Hash.new
|
|
181
|
+
|
|
182
|
+
@grabber_threads = []
|
|
183
|
+
start_link = URI.parse url
|
|
184
|
+
return false if start_link.host.nil?
|
|
185
|
+
|
|
186
|
+
allow_host(start_link)
|
|
187
|
+
|
|
188
|
+
@link_queue.enq LinkBag.new(start_link, 0)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
notify(:log, "Crawling #{url} started ..." )
|
|
192
|
+
|
|
193
|
+
@opts[:max_threads].times do |i|
|
|
194
|
+
g = Grabber.new(@link_queue, @page_queue, @opts )
|
|
195
|
+
@grabber_threads << g.run
|
|
196
|
+
end
|
|
197
|
+
|
|
198
|
+
puts "* startet #{@grabber_threads.length} grabbers"
|
|
199
|
+
|
|
200
|
+
loop do
|
|
201
|
+
pagebag = @page_queue.deq
|
|
202
|
+
|
|
203
|
+
process_links(pagebag)
|
|
204
|
+
|
|
205
|
+
process_forms(pagebag)
|
|
215
206
|
#@stats[:total_requests] += 1 unless pagebag.nil?
|
|
216
207
|
Watobo::Crawler::Status.inc_requests() unless pagebag.nil?
|
|
217
208
|
Watobo::Crawler::Status.page_size= @page_queue.size
|
|
218
|
-
Watobo::Crawler::Status.link_size= @link_queue.size
|
|
219
|
-
|
|
220
|
-
puts "Links/Pages: #{@link_queue.size}/#{@page_queue.size}"
|
|
221
|
-
#notify( :update_status, current_status )
|
|
222
|
-
# if @link_queue.empty? and @page_queue.empty?
|
|
223
|
-
if @page_queue.empty?
|
|
224
|
-
# if page_queue is empty wait for all grabber threads finishing the link_queue
|
|
225
|
-
until @link_queue.num_waiting == @grabber_threads.length
|
|
226
|
-
Thread.pass
|
|
227
|
-
end
|
|
228
|
-
# when the link_queue is finished check the page_queue. Crawling is finished if page_queue is empty too.
|
|
229
|
-
if @page_queue.empty?
|
|
230
|
-
@grabber_threads.each { |t| t.kill }
|
|
231
|
-
puts "Finished Crawling"
|
|
209
|
+
Watobo::Crawler::Status.link_size= @link_queue.size
|
|
210
|
+
|
|
211
|
+
puts "Links/Pages: #{@link_queue.size}/#{@page_queue.size}"
|
|
212
|
+
#notify( :update_status, current_status )
|
|
213
|
+
# if @link_queue.empty? and @page_queue.empty?
|
|
214
|
+
if @page_queue.empty?
|
|
215
|
+
# if page_queue is empty wait for all grabber threads finishing the link_queue
|
|
216
|
+
until @link_queue.num_waiting == @grabber_threads.length
|
|
217
|
+
Thread.pass
|
|
218
|
+
end
|
|
219
|
+
# when the link_queue is finished check the page_queue. Crawling is finished if page_queue is empty too.
|
|
220
|
+
if @page_queue.empty?
|
|
221
|
+
@grabber_threads.each { |t| t.kill }
|
|
222
|
+
puts "Finished Crawling"
|
|
232
223
|
#@status_lock.synchronize{ @engine_status = CRAWL_NONE }
|
|
233
224
|
Watobo::Crawler::Status.engine = CRAWL_NONE
|
|
234
|
-
|
|
235
|
-
notify(:log, "Crawling finished")
|
|
236
|
-
#notify( :update_status, current_status )
|
|
237
|
-
break
|
|
238
|
-
|
|
239
|
-
end
|
|
240
|
-
end
|
|
241
|
-
|
|
242
|
-
end
|
|
243
|
-
|
|
244
|
-
end
|
|
245
|
-
|
|
246
|
-
private
|
|
247
|
-
|
|
248
|
-
def current_status
|
|
225
|
+
|
|
226
|
+
notify(:log, "Crawling finished")
|
|
227
|
+
#notify( :update_status, current_status )
|
|
228
|
+
break
|
|
229
|
+
|
|
230
|
+
end
|
|
231
|
+
end
|
|
232
|
+
|
|
233
|
+
end
|
|
234
|
+
|
|
235
|
+
end
|
|
236
|
+
|
|
237
|
+
private
|
|
238
|
+
|
|
239
|
+
def current_status
|
|
249
240
|
{
|
|
250
|
-
:engine_status => @engine_status,
|
|
251
|
-
:link_size => @link_queue.size,
|
|
252
|
-
:page_size => @page_queue.size
|
|
253
|
-
}.update @stats
|
|
254
|
-
|
|
255
|
-
end
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
def allow_host(uri)
|
|
259
|
-
if uri.is_a? URI
|
|
260
|
-
site = uri.site.to_s
|
|
261
|
-
# puts "Valid Site: #{site}"
|
|
262
|
-
ah = allowed_hosts
|
|
263
|
-
ah << site
|
|
264
|
-
end
|
|
265
|
-
end
|
|
266
|
-
|
|
267
|
-
def process_forms(pagebag)
|
|
268
|
-
return false unless pagebag.respond_to? :page
|
|
269
|
-
page=pagebag.page
|
|
270
|
-
return false unless page.respond_to? :forms
|
|
271
|
-
page.forms.each do |f|
|
|
272
|
-
|
|
273
|
-
action = page.uri.merge f.action unless f.action =~ /^http/
|
|
274
|
-
f.action = action.to_s
|
|
275
|
-
|
|
276
|
-
if send_form? f
|
|
277
|
-
# puts "SUBMIT FORM: #{f.action}"
|
|
278
|
-
send_form(f, pagebag.depth)
|
|
279
|
-
end
|
|
280
|
-
end
|
|
281
|
-
end
|
|
282
|
-
|
|
283
|
-
def process_links(pagebag)
|
|
284
|
-
return false unless pagebag.respond_to? :page
|
|
285
|
-
page = pagebag.page
|
|
286
|
-
return false unless page.respond_to? :links
|
|
287
|
-
|
|
288
|
-
page.links.each do |l|
|
|
289
|
-
begin
|
|
241
|
+
:engine_status => @engine_status,
|
|
242
|
+
:link_size => @link_queue.size,
|
|
243
|
+
:page_size => @page_queue.size
|
|
244
|
+
}.update @stats
|
|
245
|
+
|
|
246
|
+
end
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
# Adds the site of +uri+ to the list of allowed hosts.
#
# Entries of allowed_hosts are interpolated into a regular expression by
# host_allowed?, so the literal site string is escaped first (otherwise
# "." would match any character). A site that is already listed is not
# added a second time.
#
# @param uri [URI] start URL of the crawl; anything else is ignored
# @return [Array, nil] the allowed-hosts list, or nil when +uri+ is no URI
def allow_host(uri)
  return unless uri.is_a? URI

  site_pattern = Regexp.escape(uri.site.to_s)
  # puts "Valid Site: #{site_pattern}"
  hosts = allowed_hosts
  hosts << site_pattern unless hosts.include?(site_pattern)
  hosts
end
|
|
257
|
+
|
|
258
|
+
# Submits the forms found on a crawled page.
#
# Relative form actions are resolved against the page URI. Absolute
# actions (starting with "http") are left as they are; they used to be
# overwritten with an empty string because the resolved value was nil
# in that case.
#
# @param pagebag [#page, #depth] container holding the fetched page
# @return [false, Object] false when +pagebag+ carries no page with forms
def process_forms(pagebag)
  return false unless pagebag.respond_to? :page
  page = pagebag.page
  return false unless page.respond_to? :forms

  page.forms.each do |form|
    unless form.action =~ /^http/
      # to_s: a form without an action resolves to the page itself
      form.action = page.uri.merge(form.action.to_s).to_s
    end

    # puts "SUBMIT FORM: #{form.action}"
    send_form(form, pagebag.depth) if send_form?(form)
  end
end
|
|
273
|
+
|
|
274
|
+
def process_links(pagebag)
|
|
275
|
+
return false unless pagebag.respond_to? :page
|
|
276
|
+
page = pagebag.page
|
|
277
|
+
return false unless page.respond_to? :links
|
|
278
|
+
|
|
279
|
+
page.links.each do |l|
|
|
280
|
+
begin
|
|
290
281
|
link = l
|
|
291
282
|
next if l.href.nil?
|
|
292
|
-
|
|
293
|
-
link = page.uri.merge l.uri unless l.href =~ /^http/
|
|
294
|
-
# puts "FOLLOW LINK #{link} ?"
|
|
295
|
-
if follow_link? link
|
|
296
|
-
# puts ">> OK"
|
|
297
|
-
submit_link(link, pagebag.depth)
|
|
298
|
-
else
|
|
299
|
-
# puts ">> NO"
|
|
300
|
-
end
|
|
301
|
-
rescue => bang
|
|
283
|
+
|
|
284
|
+
link = page.uri.merge l.uri unless l.href =~ /^http/
|
|
285
|
+
# puts "FOLLOW LINK #{link} ?"
|
|
286
|
+
if follow_link? link
|
|
287
|
+
# puts ">> OK"
|
|
288
|
+
submit_link(link, pagebag.depth)
|
|
289
|
+
else
|
|
290
|
+
# puts ">> NO"
|
|
291
|
+
end
|
|
292
|
+
rescue => bang
|
|
302
293
|
puts bang
|
|
303
|
-
puts bang.backtrace if $DEBUG
|
|
304
|
-
end
|
|
305
|
-
end
|
|
306
|
-
|
|
307
|
-
end
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
def submit_link(link, depth)
|
|
311
|
-
# @link_keys[link_key(link)] = link
|
|
312
|
-
|
|
313
|
-
clk = link_key(link, :clear_values => true)
|
|
314
|
-
@link_counts[clk] ||= 0
|
|
315
|
-
@link_counts[clk] += 1
|
|
316
|
-
lk = link_key(link)
|
|
317
|
-
return false if @link_keys.has_key? lk
|
|
318
|
-
@link_keys[lk] = nil
|
|
319
|
-
if @link_counts[clk] < @opts[:max_repeat]
|
|
320
|
-
@link_queue.enq LinkBag.new(link, depth)
|
|
321
|
-
else
|
|
322
|
-
puts "! MAX REPEAT !\nSkipped link #{link}" if $DEBUG
|
|
323
|
-
end
|
|
324
|
-
end
|
|
325
|
-
|
|
326
|
-
def form_key(form, opts={} )
|
|
327
|
-
o = { :clear_values => false }
|
|
328
|
-
o.update opts
|
|
329
|
-
|
|
330
|
-
fp = "#{form.action}"
|
|
331
|
-
fp << form.method
|
|
332
|
-
if form.request_data =~ /=/
|
|
333
|
-
data = form.request_data.split("&").sort.join("&")
|
|
334
|
-
if o[:clear_values]
|
|
335
|
-
fp << data.gsub(/=[^&]*/,'=')
|
|
336
|
-
else
|
|
337
|
-
fp << data
|
|
338
|
-
end
|
|
339
|
-
end
|
|
340
|
-
fkey = Digest::MD5.hexdigest fp
|
|
341
|
-
fkey
|
|
342
|
-
end
|
|
343
|
-
|
|
344
|
-
def send_form(form, depth)
|
|
345
|
-
return false if @engine_status == CRAWL_NONE
|
|
346
|
-
cfk = form_key(form, :clear_values => true)
|
|
347
|
-
@form_counts[cfk] ||= 0
|
|
348
|
-
@form_counts[cfk] += 1
|
|
349
|
-
|
|
350
|
-
# @form_keys[form_key(form)] = form
|
|
351
|
-
fk = form_key(form)
|
|
352
|
-
return false if @form_keys.has_key? fk
|
|
353
|
-
@form_keys[fk] = nil
|
|
354
|
-
begin
|
|
355
|
-
if @form_counts[cfk] < @opts[:max_repeat]
|
|
356
|
-
if form.buttons.length > 0
|
|
357
|
-
p = form.click_button
|
|
358
|
-
else
|
|
359
|
-
p = form.submit()
|
|
360
|
-
end
|
|
361
|
-
puts p.class
|
|
362
|
-
@page_queue.enq PageBag.new(p, depth+1)
|
|
363
|
-
else
|
|
364
|
-
puts "! MAX REPEAT !\nSkipped Form #{form.action}"
|
|
365
|
-
end
|
|
366
|
-
rescue => bang
|
|
367
|
-
puts bang
|
|
368
|
-
puts bang.backtrace
|
|
369
|
-
end
|
|
370
|
-
end
|
|
371
|
-
|
|
372
|
-
def send_form?(form)
|
|
373
|
-
# puts "SEND FORM?"
|
|
374
|
-
return false unless engine_running?
|
|
375
|
-
return false unless @opts[:submit_forms] == true
|
|
376
|
-
# puts "> submit_forms"
|
|
377
|
-
return false unless allowed? form.action
|
|
378
|
-
#puts "> allowed"
|
|
379
|
-
return false unless fields_allowed? form
|
|
380
|
-
#puts "> fields allowed"
|
|
381
|
-
return false if form_sent? form
|
|
382
|
-
# puts "> form not sent"
|
|
383
|
-
return true
|
|
384
|
-
end
|
|
385
|
-
|
|
386
|
-
def follow_link?(link)
|
|
387
|
-
return false unless allowed? link
|
|
388
|
-
return false if link_is_followed? link
|
|
389
|
-
return true
|
|
390
|
-
end
|
|
391
|
-
|
|
392
|
-
def host_allowed?(uri)
|
|
393
|
-
#puts "ALLOWED HOSTS =>"
|
|
394
|
-
#puts allowed_hosts
|
|
395
|
-
#puts "---"
|
|
396
|
-
# puts "Host Allowed?"
|
|
397
|
-
ah = allowed_hosts
|
|
398
|
-
# puts ah.class
|
|
399
|
-
#puts ah
|
|
400
|
-
return false if ah.empty?
|
|
401
|
-
ahc = ah.select{ |h| uri.site =~ /^#{h}$/ }.length
|
|
402
|
-
if ahc > 0
|
|
403
|
-
# puts "> Host IS allowed!"
|
|
404
|
-
return true
|
|
405
|
-
end
|
|
406
|
-
# puts "> Host is NOT allowed!"
|
|
407
|
-
return false
|
|
408
|
-
end
|
|
409
|
-
|
|
410
|
-
def url_allowed?(uri)
|
|
411
|
-
# puts "* excluded_urls"
|
|
412
|
-
# puts exluded_urls
|
|
413
|
-
return false if excluded_urls.select{ |url| uri.path =~ /#{url}/ }.length > 0
|
|
414
|
-
# puts "* allowed_urls"
|
|
415
|
-
# puts allowed_urls
|
|
416
|
-
return true if allowed_urls.empty?
|
|
417
|
-
return true if allowed_urls.select{ |url| uri.path =~ /#{url}/ }.length > 0
|
|
418
|
-
# puts "> URL is NOT allowed"
|
|
419
|
-
return false
|
|
420
|
-
end
|
|
421
|
-
|
|
422
|
-
def path_allowed?(uri)
|
|
423
|
-
return true if root_path.nil?
|
|
424
|
-
return true if root_path.empty?
|
|
425
|
-
return true if uri.path =~ /^#{root_path}/
|
|
426
|
-
# puts "> PATH is NOT ALLOWED"
|
|
427
|
-
return false
|
|
428
|
-
end
|
|
429
|
-
|
|
430
|
-
def cleanup_uri(obj)
|
|
431
|
-
uri = nil
|
|
432
|
-
uri = obj.uri if obj.respond_to? :uri
|
|
433
|
-
uri = URI.parse(obj) if obj.is_a? String
|
|
434
|
-
uri = obj if obj.is_a? URI::HTTP
|
|
435
|
-
uri
|
|
436
|
-
end
|
|
437
|
-
|
|
438
|
-
def allowed?(link)
|
|
439
|
-
valid = false
|
|
440
|
-
# need to handle different link objects, Mechanize::Page::Link and URIs
|
|
441
|
-
uri = nil
|
|
442
|
-
uri = link.uri if link.respond_to? :uri
|
|
443
|
-
uri = URI.parse(link) if link.is_a? String
|
|
444
|
-
uri = link if link.is_a? URI::HTTP
|
|
445
|
-
|
|
446
|
-
return false if uri.nil?
|
|
447
|
-
|
|
448
|
-
host_allowed?(uri) &&
|
|
449
|
-
url_allowed?(uri) &&
|
|
450
|
-
path_allowed?(uri)
|
|
451
|
-
end
|
|
452
|
-
|
|
453
|
-
def form_sent?(form)
|
|
454
|
-
|
|
455
|
-
@form_keys.has_key? form_key(form)
|
|
456
|
-
end
|
|
457
|
-
|
|
458
|
-
def link_key(link, opts={})
|
|
459
|
-
o = { :clear_values => false }
|
|
460
|
-
o.update opts
|
|
461
|
-
|
|
462
|
-
uri = cleanup_uri(link)
|
|
463
|
-
|
|
464
|
-
query_sorted = ""
|
|
465
|
-
query_sorted = uri.query.split("&").sort.join("&") unless uri.query.nil?
|
|
466
|
-
|
|
467
|
-
key = ""
|
|
468
|
-
key << uri.scheme
|
|
469
|
-
key << uri.site
|
|
470
|
-
key << uri.path
|
|
471
|
-
key << query_sorted
|
|
472
|
-
key.gsub!(/=[^&]*/,'=') if o[:clear_values] == true
|
|
473
|
-
|
|
474
|
-
Digest::MD5.hexdigest key
|
|
475
|
-
end
|
|
476
|
-
|
|
477
|
-
def engine_running?
|
|
478
|
-
@status_lock.synchronize do
|
|
479
|
-
return false if @engine_status == CRAWL_NONE
|
|
480
|
-
return true
|
|
481
|
-
end
|
|
482
|
-
end
|
|
483
|
-
|
|
484
|
-
def link_is_followed?(link)
|
|
485
|
-
|
|
486
|
-
return true if @link_keys.has_key? link_key(link)
|
|
487
|
-
|
|
488
|
-
false
|
|
489
|
-
end
|
|
490
|
-
|
|
491
|
-
def fields_allowed?(form)
|
|
492
|
-
form.fields.each do |f|
|
|
493
|
-
excluded_fields.each do |ef|
|
|
494
|
-
return false if f.name =~ /#{ef}/
|
|
495
|
-
end
|
|
496
|
-
end
|
|
497
|
-
return true
|
|
498
|
-
end
|
|
499
|
-
|
|
500
|
-
def method_missing(name, *args, &block)
|
|
501
|
-
# puts "* instance method missing (#{name})"
|
|
502
|
-
if name =~ /(.*)=$/
|
|
503
|
-
@opts.has_key? $1.to_sym || super
|
|
504
|
-
@opts[$1.to_sym] = args[0]
|
|
505
|
-
return @opts[$1.to_sym]
|
|
506
|
-
else
|
|
507
|
-
k = name.to_sym
|
|
508
|
-
@opts.has_key? k || super
|
|
509
|
-
# puts "Value Found For #{k.to_yaml}"
|
|
510
|
-
return @opts[k]
|
|
511
|
-
|
|
512
|
-
end
|
|
513
|
-
end
|
|
514
|
-
end
|
|
515
|
-
end
|
|
516
|
-
|
|
517
|
-
end
|
|
294
|
+
puts bang.backtrace if $DEBUG
|
|
295
|
+
end
|
|
296
|
+
end
|
|
297
|
+
|
|
298
|
+
end
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
# Queues +link+ for fetching unless it was queued before or its
# value-less variant has been seen too often.
#
# The repeat counter (keyed on the link with parameter values cleared) is
# incremented on every call, before the duplicate check.
#
# @return [false, nil, Object] false for an already known link, nil when
#   skipped because of :max_repeat, otherwise the result of the enqueue
def submit_link(link, depth)
  repeat_key = link_key(link, :clear_values => true)
  @link_counts[repeat_key] = (@link_counts[repeat_key] || 0) + 1

  exact_key = link_key(link)
  return false if @link_keys.has_key? exact_key
  # only the key matters; the link itself is not stored
  @link_keys[exact_key] = nil

  unless @link_counts[repeat_key] < @opts[:max_repeat]
    puts "! MAX REPEAT !\nSkipped link #{link}" if $DEBUG
    return nil
  end

  @link_queue.enq LinkBag.new(link, depth)
end
|
|
316
|
+
|
|
317
|
+
# Builds an MD5 fingerprint of a form from its action, its method and its
# sorted request data.
#
# @param form [#action, #method, #request_data]
# @param opts [Hash] :clear_values => true strips the parameter values so
#   that forms differing only in values share one key
# @return [String] hex digest
def form_key(form, opts={} )
  settings = { :clear_values => false }.merge(opts)

  fingerprint = "#{form.action}"
  fingerprint << form.method
  if form.request_data =~ /=/
    sorted = form.request_data.split("&").sort.join("&")
    fingerprint << (settings[:clear_values] ? sorted.gsub(/=[^&]*/,'=') : sorted)
  end

  Digest::MD5.hexdigest fingerprint
end
|
|
334
|
+
|
|
335
|
+
# Submits +form+ and queues the resulting page one level deeper.
#
# The repeat counter (keyed on the form with values cleared) is
# incremented on every call; a form whose exact key is already known is
# not sent again. Errors raised while submitting are printed, not
# propagated.
#
# NOTE(review): @engine_status is not assigned anywhere in the visible
# code (the status is written to Watobo::Crawler::Status instead), so the
# first guard may never trigger - confirm.
#
# @return [false, Object] false when the engine is stopped or the form is
#   already known
def send_form(form, depth)
  return false if @engine_status == CRAWL_NONE

  repeat_key = form_key(form, :clear_values => true)
  @form_counts[repeat_key] = (@form_counts[repeat_key] || 0) + 1

  exact_key = form_key(form)
  return false if @form_keys.has_key? exact_key
  # only the key matters; the form itself is not stored
  @form_keys[exact_key] = nil

  begin
    if @form_counts[repeat_key] < @opts[:max_repeat]
      # prefer clicking a button when the form has one
      result_page = form.buttons.length > 0 ? form.click_button : form.submit()
      puts result_page.class
      @page_queue.enq PageBag.new(result_page, depth+1)
    else
      puts "! MAX REPEAT !\nSkipped Form #{form.action}"
    end
  rescue => bang
    puts bang
    puts bang.backtrace
  end
end
|
|
362
|
+
|
|
363
|
+
# Decides whether +form+ should be submitted.
#
# All of the following must hold, checked in this order: the engine is
# running, :submit_forms is exactly true, the form action is allowed, no
# excluded field is present and the form has not been sent before.
#
# @return [Boolean]
def send_form?(form)
  if engine_running? &&
     @opts[:submit_forms] == true &&
     allowed?(form.action) &&
     fields_allowed?(form) &&
     !form_sent?(form)
    true
  else
    false
  end
end
|
|
376
|
+
|
|
377
|
+
# A link is followed when it is allowed and has not been followed yet.
#
# @return [Boolean]
def follow_link?(link)
  (allowed?(link) && !link_is_followed?(link)) ? true : false
end
|
|
382
|
+
|
|
383
|
+
# Checks the site of +uri+ against the allowed-hosts patterns.
#
# Each pattern is interpolated into a regular expression that has to
# match the site from start to end. An empty list allows nothing.
#
# @param uri [#site]
# @return [Boolean]
def host_allowed?(uri)
  patterns = allowed_hosts
  return false if patterns.empty?

  # every pattern is evaluated, as before; at least one has to match
  hits = patterns.count { |pattern| uri.site =~ /^#{pattern}$/ }
  hits > 0
end
|
|
400
|
+
|
|
401
|
+
# Checks the path of +uri+ against the excluded and allowed URL patterns.
#
# Any matching excluded pattern rejects the URL. Otherwise it is accepted
# when no allowed patterns are configured or when one of them matches.
#
# @param uri [#path]
# @return [Boolean]
def url_allowed?(uri)
  blocked = excluded_urls.reject { |pattern| (uri.path =~ /#{pattern}/).nil? }
  return false unless blocked.empty?

  return true if allowed_urls.empty?

  permitted = allowed_urls.reject { |pattern| (uri.path =~ /#{pattern}/).nil? }
  # puts "> URL is NOT allowed" if permitted.empty?
  !permitted.empty?
end
|
|
412
|
+
|
|
413
|
+
# Checks that the path of +uri+ starts with the configured root path.
#
# A nil or empty root path places no restriction on the path.
#
# @param uri [#path]
# @return [Boolean]
def path_allowed?(uri)
  root = root_path
  return true if root.nil? || root.empty?

  # puts "> PATH is NOT ALLOWED" unless the pattern matches
  (uri.path =~ /^#{root}/) ? true : false
end
|
|
420
|
+
|
|
421
|
+
# Normalises the different link representations to a URI.
#
# A URI::HTTP is returned as is, a String is parsed and any other object
# answering #uri contributes that value.
#
# @return [URI, nil] nil when +obj+ is none of the supported kinds
def cleanup_uri(obj)
  if obj.is_a? URI::HTTP
    obj
  elsif obj.is_a? String
    URI.parse(obj)
  elsif obj.respond_to? :uri
    obj.uri
  end
end
|
|
428
|
+
|
|
429
|
+
# Tells whether +link+ may be crawled: its host, its URL and its path all
# have to pass their checks, in that order.
#
# @param link [Object] a Mechanize::Page::Link, a String or a URI::HTTP
# @return [false, Object] false when +link+ cannot be turned into a URI,
#   otherwise the result of the three checks
def allowed?(link)
  # cleanup_uri handles the different link objects (link objects, Strings, URIs)
  uri = cleanup_uri(link)
  return false if uri.nil?

  host_allowed?(uri) && url_allowed?(uri) && path_allowed?(uri)
end
|
|
443
|
+
|
|
444
|
+
# Tells whether the exact key of +form+ has already been recorded.
#
# @return [Boolean]
def form_sent?(form)
  key = form_key(form)
  @form_keys.has_key?(key)
end
|
|
448
|
+
|
|
449
|
+
# Builds an MD5 fingerprint of a link from its scheme, site, path and
# sorted query string.
#
# @param link [Object] anything cleanup_uri can turn into a URI
# @param opts [Hash] :clear_values => true strips everything between "="
#   and the next "&" so that links differing only in values share one key
# @return [String] hex digest
def link_key(link, opts={})
  settings = { :clear_values => false }.merge(opts)

  uri = cleanup_uri(link)
  query_sorted = uri.query.nil? ? "" : uri.query.split("&").sort.join("&")

  key = ""
  key << uri.scheme << uri.site << uri.path << query_sorted
  key.gsub!(/=[^&]*/,'=') if settings[:clear_values] == true

  Digest::MD5.hexdigest key
end
|
|
467
|
+
|
|
468
|
+
# Reports whether the engine status differs from CRAWL_NONE, reading the
# status while holding @status_lock.
#
# NOTE(review): @engine_status is not assigned anywhere in the visible
# code (the status is written to Watobo::Crawler::Status instead) -
# confirm this check still reflects the real engine state.
#
# @return [Boolean]
def engine_running?
  @status_lock.synchronize { @engine_status != CRAWL_NONE }
end
|
|
474
|
+
|
|
475
|
+
# Tells whether the exact key of +link+ has already been recorded.
#
# @return [Boolean]
def link_is_followed?(link)
  @link_keys.has_key?(link_key(link)) ? true : false
end
|
|
481
|
+
|
|
482
|
+
# Rejects a form as soon as one of its field names matches one of the
# excluded-field patterns.
#
# @param form [#fields] fields must answer #name
# @return [Boolean]
def fields_allowed?(form)
  forbidden = form.fields.any? do |field|
    excluded_fields.any? { |pattern| field.name =~ /#{pattern}/ }
  end
  !forbidden
end
|
|
490
|
+
|
|
491
|
+
def method_missing(name, *args, &block)
|
|
492
|
+
# puts "* instance method missing (#{name})"
|
|
493
|
+
if name =~ /(.*)=$/
|
|
494
|
+
@opts.has_key? $1.to_sym || super
|
|
495
|
+
@opts[$1.to_sym] = args[0]
|
|
496
|
+
return @opts[$1.to_sym]
|
|
497
|
+
else
|
|
498
|
+
k = name.to_sym
|
|
499
|
+
@opts.has_key? k || super
|
|
500
|
+
# puts "Value Found For #{k.to_yaml}"
|
|
501
|
+
return @opts[k]
|
|
502
|
+
|
|
503
|
+
end
|
|
504
|
+
end
|
|
505
|
+
end
|
|
506
|
+
end
|
|
507
|
+
|
|
508
|
+
end
|