watobo 0.9.21 → 0.9.23
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +46 -1
- data/bin/nfq_server.rb +0 -9
- data/bin/watobo_gui.rb +3 -13
- data/custom-views/prettify-json.rb +9 -18
- data/icons/watobo.ico +0 -0
- data/icons/watobo.ico.old +0 -0
- data/lib/watobo.rb +10 -19
- data/lib/watobo/adapters.rb +5 -14
- data/lib/watobo/adapters/data_store.rb +50 -59
- data/lib/watobo/adapters/file/file_store.rb +287 -296
- data/lib/watobo/adapters/file/marshal_store.rb +293 -296
- data/lib/watobo/adapters/session_store.rb +5 -14
- data/lib/watobo/ca.rb +1 -10
- data/lib/watobo/config.rb +197 -206
- data/lib/watobo/constants.rb +0 -9
- data/lib/watobo/core.rb +3 -12
- data/lib/watobo/core/active_check.rb +72 -135
- data/lib/watobo/core/active_checks.rb +49 -58
- data/lib/watobo/core/ca.rb +369 -389
- data/lib/watobo/core/cert_store.rb +34 -43
- data/lib/watobo/core/chat.rb +92 -101
- data/lib/watobo/core/chats.rb +271 -280
- data/lib/watobo/core/client_cert_store.rb +106 -35
- data/lib/watobo/core/conversation.rb +48 -57
- data/lib/watobo/core/cookie.rb +23 -32
- data/lib/watobo/core/egress_handlers.rb +98 -0
- data/lib/watobo/core/finding.rb +66 -75
- data/lib/watobo/core/findings.rb +107 -114
- data/lib/watobo/core/forwarding_proxy.rb +13 -22
- data/lib/watobo/core/fuzz_gen.rb +0 -9
- data/lib/watobo/core/intercept_carver.rb +166 -177
- data/lib/watobo/core/intercept_filter.rb +235 -244
- data/lib/watobo/core/interceptor.rb +98 -107
- data/lib/watobo/core/min_class.rb +4 -13
- data/lib/watobo/core/netfilter_queue.rb +170 -179
- data/lib/watobo/core/ott_cache.rb +132 -141
- data/lib/watobo/core/parameter.rb +43 -52
- data/lib/watobo/core/passive_check.rb +103 -102
- data/lib/watobo/core/passive_checks.rb +48 -57
- data/lib/watobo/core/passive_scanner.rb +54 -55
- data/lib/watobo/core/plugin.rb +11 -20
- data/lib/watobo/core/project.rb +3 -9
- data/lib/watobo/core/proxy.rb +43 -52
- data/lib/watobo/core/request.rb +125 -123
- data/lib/watobo/core/response.rb +44 -53
- data/lib/watobo/core/scanner.rb +0 -9
- data/lib/watobo/core/scanner3.rb +405 -414
- data/lib/watobo/core/scope.rb +83 -92
- data/lib/watobo/core/session.rb +1043 -1026
- data/lib/watobo/core/sid_cache.rb +98 -107
- data/lib/watobo/core/subscriber.rb +25 -34
- data/lib/watobo/defaults.rb +21 -30
- data/lib/watobo/external/diff/lcs.rb +0 -9
- data/lib/watobo/external/diff/lcs/array.rb +0 -9
- data/lib/watobo/external/diff/lcs/block.rb +0 -9
- data/lib/watobo/external/diff/lcs/callbacks.rb +0 -9
- data/lib/watobo/external/diff/lcs/change.rb +0 -9
- data/lib/watobo/external/diff/lcs/hunk.rb +0 -9
- data/lib/watobo/external/diff/lcs/ldiff.rb +0 -9
- data/lib/watobo/external/diff/lcs/string.rb +0 -9
- data/lib/watobo/externals.rb +6 -15
- data/lib/watobo/framework.rb +4 -13
- data/lib/watobo/framework/create_project.rb +60 -69
- data/lib/watobo/framework/init.rb +0 -9
- data/lib/watobo/framework/init_modules.rb +0 -9
- data/lib/watobo/framework/license_text.rb +28 -37
- data/lib/watobo/framework/load_chat.rb +13 -22
- data/lib/watobo/gui.rb +132 -123
- data/lib/watobo/gui/about_watobo.rb +0 -9
- data/lib/watobo/gui/browser_preview.rb +0 -9
- data/lib/watobo/gui/certificate_dialog.rb +0 -9
- data/lib/watobo/gui/chat_diff.rb +0 -9
- data/lib/watobo/gui/chatviewer_frame.rb +73 -72
- data/lib/watobo/gui/checkboxtree.rb +0 -9
- data/lib/watobo/gui/checks_policy_frame.rb +0 -9
- data/lib/watobo/gui/client_cert_dialog.rb +96 -87
- data/lib/watobo/gui/confirm_scan_dialog.rb +0 -9
- data/lib/watobo/gui/conversation_table.rb +158 -164
- data/lib/watobo/gui/conversation_table_ctrl.rb +207 -216
- data/lib/watobo/gui/conversation_table_ctrl2.rb +373 -382
- data/lib/watobo/gui/csrf_token_dialog.rb +0 -9
- data/lib/watobo/gui/custom_viewer.rb +374 -383
- data/lib/watobo/gui/dashboard.rb +296 -303
- data/lib/watobo/gui/define_scope_frame.rb +0 -9
- data/lib/watobo/gui/differ_frame.rb +215 -224
- data/lib/watobo/gui/edit_comment.rb +0 -9
- data/lib/watobo/gui/edit_scope_dialog.rb +0 -9
- data/lib/watobo/gui/export_dialog.rb +104 -113
- data/lib/watobo/gui/finding_info.rb +0 -9
- data/lib/watobo/gui/findings_tree.rb +210 -217
- data/lib/watobo/gui/full_scan_dialog.rb +0 -9
- data/lib/watobo/gui/fuzzer_gui.rb +1295 -1313
- data/lib/watobo/gui/fxsave_thread.rb +14 -0
- data/lib/watobo/gui/goto_url_dialog.rb +70 -79
- data/lib/watobo/gui/hex_viewer.rb +0 -9
- data/lib/watobo/gui/html_viewer.rb +287 -296
- data/lib/watobo/gui/intercept_filter_dialog.rb +188 -197
- data/lib/watobo/gui/interceptor_gui.rb +1041 -1051
- data/lib/watobo/gui/interceptor_settings_dialog.rb +0 -9
- data/lib/watobo/gui/json_viewer.rb +287 -0
- data/lib/watobo/gui/list_box.rb +101 -110
- data/lib/watobo/gui/log_file_viewer.rb +32 -41
- data/lib/watobo/gui/log_viewer.rb +83 -88
- data/lib/watobo/gui/login_wizzard.rb +0 -9
- data/lib/watobo/gui/main_window.rb +587 -618
- data/lib/watobo/gui/manual_request_editor.rb +620 -565
- data/lib/watobo/gui/master_pw_dialog.rb +0 -9
- data/lib/watobo/gui/mixins/gui_settings.rb +29 -38
- data/lib/watobo/gui/page_tree.rb +217 -226
- data/lib/watobo/gui/password_policy_dialog.rb +0 -9
- data/lib/watobo/gui/plugin_board.rb +0 -9
- data/lib/watobo/gui/preferences_dialog.rb +0 -9
- data/lib/watobo/gui/progress_window.rb +17 -27
- data/lib/watobo/gui/project_wizzard.rb +0 -9
- data/lib/watobo/gui/proxy_dialog.rb +1 -10
- data/lib/watobo/gui/quick_scan_dialog.rb +0 -9
- data/lib/watobo/gui/request_builder_frame.rb +102 -111
- data/lib/watobo/gui/request_editor.rb +181 -137
- data/lib/watobo/gui/rewrite_filters_dialog.rb +394 -403
- data/lib/watobo/gui/rewrite_rules_dialog.rb +372 -381
- data/lib/watobo/gui/save_chat_dialog.rb +140 -149
- data/lib/watobo/gui/scanner_settings_dialog.rb +0 -9
- data/lib/watobo/gui/select_chat_dialog.rb +0 -9
- data/lib/watobo/gui/session_management_dialog.rb +0 -9
- data/lib/watobo/gui/sites_tree.rb +0 -9
- data/lib/watobo/gui/status_bar.rb +0 -9
- data/lib/watobo/gui/table_editor.rb +0 -9
- data/lib/watobo/gui/tagless_viewer.rb +0 -9
- data/lib/watobo/gui/templates/plugin.rb +0 -9
- data/lib/watobo/gui/templates/plugin2.rb +92 -100
- data/lib/watobo/gui/templates/plugin_base.rb +144 -153
- data/lib/watobo/gui/text_viewer.rb +0 -9
- data/lib/watobo/gui/transcoder_window.rb +0 -9
- data/lib/watobo/gui/utils/gui_utils.rb +0 -9
- data/lib/watobo/gui/utils/init_icons.rb +86 -95
- data/lib/watobo/gui/utils/load_icons.rb +33 -42
- data/lib/watobo/gui/utils/load_plugins.rb +116 -119
- data/lib/watobo/gui/utils/master_password.rb +68 -77
- data/lib/watobo/gui/utils/save_default_settings.rb +113 -122
- data/lib/watobo/gui/utils/save_project_settings.rb +0 -9
- data/lib/watobo/gui/utils/save_proxy_settings.rb +41 -50
- data/lib/watobo/gui/utils/save_scanner_settings.rb +18 -27
- data/lib/watobo/gui/utils/session_history.rb +112 -121
- data/lib/watobo/gui/workspace_dialog.rb +0 -9
- data/lib/watobo/gui/www_auth_dialog.rb +0 -9
- data/lib/watobo/gui/xml_viewer_frame.rb +0 -9
- data/lib/watobo/http.rb +4 -13
- data/lib/watobo/http/cookies/cookies.rb +26 -35
- data/lib/watobo/http/data/data.rb +45 -54
- data/lib/watobo/http/data/json.rb +47 -55
- data/lib/watobo/http/url/url.rb +38 -47
- data/lib/watobo/http/xml/xml.rb +124 -130
- data/lib/watobo/interceptor.rb +3 -12
- data/lib/watobo/interceptor/proxy.rb +742 -739
- data/lib/watobo/interceptor/transparent.rb +22 -24
- data/lib/watobo/mixins.rb +10 -19
- data/lib/watobo/mixins/check_info.rb +27 -36
- data/lib/watobo/mixins/httpparser.rb +613 -637
- data/lib/watobo/mixins/request_parser.rb +88 -97
- data/lib/watobo/mixins/shapers.rb +515 -529
- data/lib/watobo/mixins/transcoders.rb +3 -11
- data/lib/watobo/parser.rb +1 -10
- data/lib/watobo/parser/html.rb +83 -92
- data/lib/watobo/patch_fxruby_setfocus.rb +26 -0
- data/lib/watobo/sockets.rb +3 -12
- data/lib/watobo/sockets/agent.rb +828 -837
- data/lib/watobo/sockets/client_socket.rb +308 -312
- data/lib/watobo/sockets/connection.rb +401 -410
- data/lib/watobo/sockets/http_socket.rb +11 -13
- data/lib/watobo/sockets/ntlm_auth.rb +129 -138
- data/lib/watobo/utils.rb +10 -19
- data/lib/watobo/utils/check_regex.rb +0 -9
- data/lib/watobo/utils/copy_object.rb +0 -9
- data/lib/watobo/utils/crypto.rb +0 -9
- data/lib/watobo/utils/expand_range.rb +23 -32
- data/lib/watobo/utils/export_xml.rb +97 -106
- data/lib/watobo/utils/file_management.rb +9 -11
- data/lib/watobo/utils/hexprint.rb +9 -18
- data/lib/watobo/utils/load_chat.rb +0 -9
- data/lib/watobo/utils/load_icon.rb +0 -9
- data/lib/watobo/utils/ntlm.rb +866 -875
- data/lib/watobo/utils/print_debug.rb +12 -21
- data/lib/watobo/utils/response_builder.rb +90 -99
- data/lib/watobo/utils/response_hash.rb +0 -9
- data/lib/watobo/utils/secure_eval.rb +0 -9
- data/lib/watobo/utils/strings.rb +10 -19
- data/lib/watobo/utils/text2request.rb +0 -9
- data/lib/watobo/utils/url.rb +23 -32
- data/lib/watobo/utils/utf16.rb +11 -20
- data/modules/active/Apache/mod_status.rb +0 -9
- data/modules/active/Apache/multiview.rb +151 -160
- data/modules/active/Flash/crossdomain.rb +0 -9
- data/modules/active/JWT/jwt_oauth2_none.rb +111 -0
- data/modules/active/cq5/cq5_default_selectors.rb +106 -115
- data/modules/active/cq5/cqp_user_enumeration.rb +125 -134
- data/modules/active/directories/dirwalker.rb +0 -9
- data/modules/active/discovery/fileextensions.rb +0 -9
- data/modules/active/discovery/http_methods.rb +0 -9
- data/modules/active/discovery/jsmapfiles.rb +79 -0
- data/modules/active/domino/domino_db.rb +68 -76
- data/modules/active/dotNET/custom_errors.rb +102 -111
- data/modules/active/dotNET/dotnet_files.rb +90 -99
- data/modules/active/fileinclusion/lfi_simple.rb +0 -9
- data/modules/active/jboss/jboss_basic.rb +0 -9
- data/modules/active/sap/business_objects.rb +51 -60
- data/modules/active/sap/its_commands.rb +0 -9
- data/modules/active/sap/its_service_parameter.rb +0 -9
- data/modules/active/sap/its_services.rb +0 -9
- data/modules/active/sap/its_xss.rb +0 -9
- data/modules/active/shell_shock/shell_shock.rb +139 -148
- data/modules/active/siebel/siebel_apps.rb +160 -169
- data/modules/active/sqlinjection/sql_boolean.rb +0 -9
- data/modules/active/sqlinjection/sql_numerical.rb +198 -0
- data/modules/active/sqlinjection/sqli_error.rb +0 -9
- data/modules/active/sqlinjection/sqli_timing.rb +220 -229
- data/modules/active/struts2/default_handler_ognl.rb +106 -115
- data/modules/active/struts2/include_params_ognl.rb +105 -114
- data/modules/active/xml/xml_xxe.rb +112 -123
- data/modules/active/xss/xss_ng.rb +214 -223
- data/modules/active/xss/xss_simple.rb +0 -9
- data/modules/passive/ajax.rb +68 -77
- data/modules/passive/autocomplete.rb +56 -65
- data/modules/passive/cookie_options.rb +0 -9
- data/modules/passive/cookie_xss.rb +0 -9
- data/modules/passive/detect_code.rb +0 -9
- data/modules/passive/detect_fileupload.rb +0 -9
- data/modules/passive/detect_infrastructure.rb +0 -9
- data/modules/passive/detect_one_time_tokens.rb +0 -9
- data/modules/passive/dirindexing.rb +0 -9
- data/modules/passive/disclosure_domino.rb +55 -64
- data/modules/passive/disclosure_emails.rb +0 -9
- data/modules/passive/disclosure_ipaddr.rb +55 -53
- data/modules/passive/filename_as_parameter.rb +0 -9
- data/modules/passive/form_spotter.rb +0 -9
- data/modules/passive/hidden_fields.rb +50 -59
- data/modules/passive/hotspots.rb +0 -9
- data/modules/passive/in_script_parameter.rb +0 -9
- data/modules/passive/json_web_token.rb +93 -0
- data/modules/passive/multiple_server_headers.rb +0 -9
- data/modules/passive/possible_login.rb +0 -9
- data/modules/passive/redirect_url.rb +0 -9
- data/modules/passive/redirectionz.rb +0 -9
- data/modules/passive/sap-headers.rb +56 -65
- data/modules/passive/xss_dom.rb +0 -9
- data/plugins/aem/aem.rb +11 -20
- data/plugins/aem/gui/main.rb +118 -127
- data/plugins/aem/gui/tree_view.rb +171 -180
- data/plugins/aem/lib/agent.rb +130 -138
- data/plugins/aem/lib/dispatcher.rb +45 -51
- data/plugins/aem/lib/engine.rb +177 -186
- data/plugins/catalog/catalog.rb +345 -355
- data/plugins/crawler/crawler.rb +4 -13
- data/plugins/crawler/gui.rb +5 -14
- data/plugins/crawler/gui/auth_frame.rb +270 -279
- data/plugins/crawler/gui/crawler_gui.rb +271 -276
- data/plugins/crawler/gui/general_settings_frame.rb +96 -105
- data/plugins/crawler/gui/hooks_frame.rb +80 -89
- data/plugins/crawler/gui/scope_frame.rb +50 -59
- data/plugins/crawler/gui/settings_tabbook.rb +38 -47
- data/plugins/crawler/gui/status_frame.rb +59 -68
- data/plugins/crawler/lib/bags.rb +18 -27
- data/plugins/crawler/lib/constants.rb +11 -20
- data/plugins/crawler/lib/engine.rb +488 -497
- data/plugins/crawler/lib/grabber.rb +68 -77
- data/plugins/crawler/lib/status.rb +71 -80
- data/plugins/crawler/lib/uri_mp.rb +12 -21
- data/plugins/filefinder/filefinder.rb +326 -333
- data/plugins/sqlmap/bin/test.rb +78 -87
- data/plugins/sqlmap/gui.rb +4 -13
- data/plugins/sqlmap/gui/main.rb +218 -227
- data/plugins/sqlmap/gui/options_frame.rb +97 -106
- data/plugins/sqlmap/lib/sqlmap_ctrl.rb +90 -100
- data/plugins/sqlmap/sqlmap.rb +2 -11
- data/plugins/sslchecker/cli/sslchecker_cli.rb +0 -9
- data/plugins/sslchecker/gui/cipher_table.rb +246 -254
- data/plugins/sslchecker/gui/gui.rb +258 -264
- data/plugins/sslchecker/gui/sslchecker.rb +4 -13
- data/plugins/sslchecker/lib/check.rb +127 -133
- data/plugins/wshell/gui/main.rb +119 -117
- data/plugins/wshell/lib/core.rb +38 -88
- data/plugins/wshell/wshell.rb +11 -20
- metadata +170 -164
@@ -1,49 +1,40 @@
|
|
1
|
-
#.
|
2
|
-
# settings_tabbook.rb
|
3
|
-
#.
|
4
|
-
# Copyright 2014 by siberas, http://www.siberas.de
|
5
|
-
# This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
|
6
|
-
# WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
|
7
|
-
# WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
8
|
-
# You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
9
|
-
|
10
1
|
# @private
|
11
|
-
module Watobo#:nodoc: all
|
12
|
-
module Plugin
|
13
|
-
module Crawler
|
14
|
-
class Gui
|
15
|
-
class SettingsTabBook < FXTabBook
|
16
|
-
attr :hooks, :general, :log_viewer, :auth, :scope
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
def initialize(owner)
|
21
|
-
#@tab = FXTabBook.new(self, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
|
22
|
-
super(owner, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
|
23
|
-
FXTabItem.new(self, "General", nil)
|
24
|
-
# frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_RAISED)
|
25
|
-
@general = GeneralSettingsFrame.new(self)
|
26
|
-
|
27
|
-
FXTabItem.new(self, "Scope", nil)
|
28
|
-
@scope = ScopeFrame.new(self)
|
29
|
-
|
30
|
-
FXTabItem.new(self, "Auth", nil)
|
31
|
-
@auth = AuthFrame.new(self)
|
32
|
-
|
33
|
-
|
34
|
-
FXTabItem.new(self, "Hooks", nil)
|
35
|
-
@hooks = HooksFrame.new(self)
|
36
|
-
|
37
|
-
FXTabItem.new(self, "Log", nil)
|
38
|
-
frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_THICK|FRAME_RAISED)
|
39
|
-
@log_viewer = Watobo::Gui::LogViewer.new(frame, :append, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_SUNKEN)
|
40
|
-
|
41
|
-
self.connect(SEL_COMMAND){
|
42
|
-
@hooks.selected if self.current == 3
|
43
|
-
}
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|
2
|
+
module Watobo#:nodoc: all
|
3
|
+
module Plugin
|
4
|
+
module Crawler
|
5
|
+
class Gui
|
6
|
+
class SettingsTabBook < FXTabBook
|
7
|
+
attr :hooks, :general, :log_viewer, :auth, :scope
|
8
|
+
|
9
|
+
|
10
|
+
|
11
|
+
def initialize(owner)
|
12
|
+
#@tab = FXTabBook.new(self, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
|
13
|
+
super(owner, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
|
14
|
+
FXTabItem.new(self, "General", nil)
|
15
|
+
# frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_RAISED)
|
16
|
+
@general = GeneralSettingsFrame.new(self)
|
17
|
+
|
18
|
+
FXTabItem.new(self, "Scope", nil)
|
19
|
+
@scope = ScopeFrame.new(self)
|
20
|
+
|
21
|
+
FXTabItem.new(self, "Auth", nil)
|
22
|
+
@auth = AuthFrame.new(self)
|
23
|
+
|
24
|
+
|
25
|
+
FXTabItem.new(self, "Hooks", nil)
|
26
|
+
@hooks = HooksFrame.new(self)
|
27
|
+
|
28
|
+
FXTabItem.new(self, "Log", nil)
|
29
|
+
frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_THICK|FRAME_RAISED)
|
30
|
+
@log_viewer = Watobo::Gui::LogViewer.new(frame, :append, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_SUNKEN)
|
31
|
+
|
32
|
+
self.connect(SEL_COMMAND){
|
33
|
+
@hooks.selected if self.current == 3
|
34
|
+
}
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
49
40
|
end
|
@@ -1,71 +1,62 @@
|
|
1
|
-
#.
|
2
|
-
# status_frame.rb
|
3
|
-
#.
|
4
|
-
# Copyright 2014 by siberas, http://www.siberas.de
|
5
|
-
# This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
|
6
|
-
# WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
|
7
|
-
# WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
8
|
-
# You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
9
|
-
|
10
1
|
# @private
|
11
|
-
module Watobo#:nodoc: all
|
12
|
-
module Plugin
|
13
|
-
module Crawler
|
14
|
-
class Gui
|
15
|
-
class StatusFrame < FXHorizontalFrame
|
16
|
-
|
17
|
-
include Watobo::Plugin::Crawler::Constants
|
18
|
-
# :engine_status => CRAWL_NONE,
|
19
|
-
# :page_size => 0,
|
20
|
-
# :link_size => 0,
|
21
|
-
# :skipped_domains => 0
|
2
|
+
module Watobo#:nodoc: all
|
3
|
+
module Plugin
|
4
|
+
module Crawler
|
5
|
+
class Gui
|
6
|
+
class StatusFrame < FXHorizontalFrame
|
7
|
+
|
8
|
+
include Watobo::Plugin::Crawler::Constants
|
9
|
+
# :engine_status => CRAWL_NONE,
|
10
|
+
# :page_size => 0,
|
11
|
+
# :link_size => 0,
|
12
|
+
# :skipped_domains => 0
|
22
13
|
def update_status(status)
|
23
|
-
#puts status.to_yaml
|
24
|
-
if status.has_key? :engine_status
|
25
|
-
case status[:engine_status]
|
26
|
-
when CRAWL_NONE
|
27
|
-
self.backColor = self.parent.backColor
|
28
|
-
@status_txt.text = "Status: Idle"
|
29
|
-
when CRAWL_RUNNING
|
30
|
-
self.backColor = FXColor::Red
|
31
|
-
@status_txt.text = "Status: Running"
|
32
|
-
|
33
|
-
when CRAWL_PAUSED
|
34
|
-
self.backColor = FXColor::Yellow
|
35
|
-
@status_txt.text = "Status: Paused"
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
if status.has_key? :link_size
|
40
|
-
@link_size_txt.text = "Links: #{status[:link_size]}"
|
41
|
-
end
|
42
|
-
|
43
|
-
if status.has_key? :page_size
|
44
|
-
@page_size_txt.text = "Pages: #{status[:page_size]}"
|
45
|
-
end
|
46
|
-
|
47
|
-
if status.has_key? :total_requests
|
48
|
-
@requests_txt.text = "Requests: #{status[:total_requests]}"
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
def initialize(owner)
|
53
|
-
super(owner, :opts => LAYOUT_FILL_X|FRAME_RAISED)
|
54
|
-
@info_fields = []
|
55
|
-
#frame = FXHorizontalFrame.new(, :opts => LAYOUT_FILL_Y, :padding => 0)
|
56
|
-
frame = self
|
57
|
-
@info_fields << ( @status_txt = FXLabel.new(frame, "Status: Stopped", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
|
58
|
-
@info_fields << (@link_size_txt = FXLabel.new(frame, "Links: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
|
59
|
-
@info_fields << (@page_size_txt = FXLabel.new(frame, "Pages: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
|
60
|
-
@info_fields << (@requests_txt = FXLabel.new(frame, "Requests: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
|
61
|
-
|
62
|
-
@info_fields.each do |i|
|
63
|
-
i.justify = JUSTIFY_LEFT
|
64
|
-
end
|
65
|
-
end
|
66
|
-
|
67
|
-
end
|
68
|
-
end
|
69
|
-
end
|
70
|
-
end
|
14
|
+
#puts status.to_yaml
|
15
|
+
if status.has_key? :engine_status
|
16
|
+
case status[:engine_status]
|
17
|
+
when CRAWL_NONE
|
18
|
+
self.backColor = self.parent.backColor
|
19
|
+
@status_txt.text = "Status: Idle"
|
20
|
+
when CRAWL_RUNNING
|
21
|
+
self.backColor = FXColor::Red
|
22
|
+
@status_txt.text = "Status: Running"
|
23
|
+
|
24
|
+
when CRAWL_PAUSED
|
25
|
+
self.backColor = FXColor::Yellow
|
26
|
+
@status_txt.text = "Status: Paused"
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
if status.has_key? :link_size
|
31
|
+
@link_size_txt.text = "Links: #{status[:link_size]}"
|
32
|
+
end
|
33
|
+
|
34
|
+
if status.has_key? :page_size
|
35
|
+
@page_size_txt.text = "Pages: #{status[:page_size]}"
|
36
|
+
end
|
37
|
+
|
38
|
+
if status.has_key? :total_requests
|
39
|
+
@requests_txt.text = "Requests: #{status[:total_requests]}"
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def initialize(owner)
|
44
|
+
super(owner, :opts => LAYOUT_FILL_X|FRAME_RAISED)
|
45
|
+
@info_fields = []
|
46
|
+
#frame = FXHorizontalFrame.new(, :opts => LAYOUT_FILL_Y, :padding => 0)
|
47
|
+
frame = self
|
48
|
+
@info_fields << ( @status_txt = FXLabel.new(frame, "Status: Stopped", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
|
49
|
+
@info_fields << (@link_size_txt = FXLabel.new(frame, "Links: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
|
50
|
+
@info_fields << (@page_size_txt = FXLabel.new(frame, "Pages: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
|
51
|
+
@info_fields << (@requests_txt = FXLabel.new(frame, "Requests: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
|
52
|
+
|
53
|
+
@info_fields.each do |i|
|
54
|
+
i.justify = JUSTIFY_LEFT
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
71
62
|
end
|
data/plugins/crawler/lib/bags.rb
CHANGED
@@ -1,29 +1,20 @@
|
|
1
|
-
#.
|
2
|
-
# bags.rb
|
3
|
-
#.
|
4
|
-
# Copyright 2014 by siberas, http://www.siberas.de
|
5
|
-
# This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
|
6
|
-
# WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
|
7
|
-
# WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
8
|
-
# You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
9
|
-
|
10
1
|
# @private
|
11
|
-
module Watobo#:nodoc: all
|
12
|
-
module Crawler
|
13
|
-
class PageBag
|
14
|
-
attr :page, :depth
|
15
|
-
def initialize(page, depth)
|
16
|
-
@page = page
|
17
|
-
@depth = depth
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
class LinkBag
|
22
|
-
attr :link, :depth
|
23
|
-
def initialize(link, depth)
|
24
|
-
@link = link
|
25
|
-
@depth = depth
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|
2
|
+
module Watobo#:nodoc: all
|
3
|
+
module Crawler
|
4
|
+
class PageBag
|
5
|
+
attr :page, :depth
|
6
|
+
def initialize(page, depth)
|
7
|
+
@page = page
|
8
|
+
@depth = depth
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
class LinkBag
|
13
|
+
attr :link, :depth
|
14
|
+
def initialize(link, depth)
|
15
|
+
@link = link
|
16
|
+
@depth = depth
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
29
20
|
end
|
@@ -1,22 +1,13 @@
|
|
1
|
-
#.
|
2
|
-
# constants.rb
|
3
|
-
#.
|
4
|
-
# Copyright 2014 by siberas, http://www.siberas.de
|
5
|
-
# This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
|
6
|
-
# WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
|
7
|
-
# WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
8
|
-
# You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
9
|
-
|
10
1
|
# @private
|
11
|
-
module Watobo#:nodoc: all
|
12
|
-
module Plugin
|
13
|
-
module Crawler
|
14
|
-
module Constants
|
15
|
-
CRAWL_NONE = 0x00
|
16
|
-
CRAWL_RUNNING = 0x01
|
17
|
-
CRAWL_PAUSED = 0x02
|
18
|
-
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|
2
|
+
module Watobo#:nodoc: all
|
3
|
+
module Plugin
|
4
|
+
module Crawler
|
5
|
+
module Constants
|
6
|
+
CRAWL_NONE = 0x00
|
7
|
+
CRAWL_RUNNING = 0x01
|
8
|
+
CRAWL_PAUSED = 0x02
|
9
|
+
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
22
13
|
end
|
@@ -1,517 +1,508 @@
|
|
1
|
-
#.
|
2
|
-
# engine.rb
|
3
|
-
#.
|
4
|
-
# Copyright 2014 by siberas, http://www.siberas.de
|
5
|
-
# This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
|
6
|
-
# WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
|
7
|
-
# WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
8
|
-
# You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
9
|
-
|
10
1
|
# @private
|
11
|
-
module Watobo#:nodoc: all
|
12
|
-
module Crawler
|
13
|
-
|
14
|
-
class Agent < Mechanize
|
15
|
-
|
16
|
-
def initialize(opts)
|
17
|
-
super()
|
18
|
-
|
19
|
-
|
20
|
-
self.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
21
|
-
self.ignore_bad_chunking = true
|
22
|
-
self.keep_alive = false
|
23
|
-
|
24
|
-
self.user_agent = opts[:user_agent] if opts.has_key?(:user_agent)
|
25
|
-
|
26
|
-
if opts.has_key? :username and opts.has_key? :password
|
27
|
-
unless opts[:username].empty? and opts[:password].empty?
|
28
|
-
|
29
|
-
user = opts[:username]
|
30
|
-
pw = opts[:password]
|
31
|
-
uri = opts[:auth_uri]
|
32
|
-
# puts "Got Credentials for #{uri}: #{user} / #{pw}"
|
33
|
-
self.add_auth(uri, user , pw )
|
34
|
-
# TODO: remove this workaround for a Mechanize Bug (#243)
|
35
|
-
p = self.get uri
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
if ( opts.has_key? :proxy_host ) && ( opts.has_key? :proxy_port )
|
40
|
-
self.set_proxy( opts[:proxy_host], opts[:proxy_port] )
|
41
|
-
end
|
42
|
-
|
43
|
-
if opts.has_key? :pre_connect_hook
|
44
|
-
self.pre_connect_hooks << opts[:pre_connect_hook] if opts[:pre_connect_hook].respond_to? :call
|
45
|
-
end
|
46
|
-
|
47
|
-
unless opts[:cookie_jar].nil?
|
48
|
-
clean_jar = Mechanize::CookieJar.new
|
49
|
-
opts[:cookie_jar].each{ |cookie|
|
50
|
-
clean_jar.add! cookie
|
51
|
-
}
|
52
|
-
self.cookie_jar = clean_jar
|
53
|
-
end
|
54
|
-
|
55
|
-
end
|
56
|
-
|
57
|
-
end
|
58
|
-
|
59
|
-
class Engine
|
60
|
-
include Watobo::Plugin::Crawler::Constants
|
61
|
-
|
62
|
-
def subscribe(event, &callback)
|
63
|
-
(@event_dispatcher_listeners[event] ||= []) << callback
|
64
|
-
end
|
65
|
-
|
66
|
-
def clearEvents(event)
|
67
|
-
@event_dispatcher_listeners[event] ||= []
|
68
|
-
@event_dispatcher_listeners[event].clear
|
69
|
-
end
|
70
|
-
|
71
|
-
def notify(event, *args)
|
72
|
-
if @event_dispatcher_listeners[event]
|
73
|
-
# puts "NOTIFY: #{self}(:#{event}) [#{@event_dispatcher_listeners[event].length}]" if $DEBUG
|
74
|
-
@event_dispatcher_listeners[event].each do |m|
|
75
|
-
m.call(*args) if m.respond_to? :call
|
76
|
-
end
|
77
|
-
end
|
78
|
-
end
|
79
|
-
|
80
|
-
def settings
|
81
|
-
@opts
|
82
|
-
end
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
def get_page(url, opts={})
|
87
|
-
ro = {}.update @opts
|
88
|
-
ro.update opts
|
89
|
-
agent = Crawler::Agent.new(ro)
|
90
|
-
page = nil
|
91
|
-
page = agent.get url
|
92
|
-
return agent, page
|
93
|
-
end
|
94
|
-
|
95
|
-
def initialize(opts={})
|
96
|
-
@event_dispatcher_listeners = Hash.new
|
97
|
-
@status_lock = Mutex.new
|
98
|
-
|
99
|
-
@opts = {
|
100
|
-
:submit_forms => true,
|
101
|
-
:max_depth => 5,
|
102
|
-
:max_repeat => 20,
|
103
|
-
:max_threads => 4,
|
104
|
-
:user_agent => "watobo-crawler",
|
105
|
-
:proxy_host => '127.0.0.1',
|
106
|
-
:proxy_port => Watobo::Conf::Interceptor.port,
|
107
|
-
:delay => 0,
|
108
|
-
:head_request_pattern => '(pdf|swf|doc|flv|jpg|png|gif)',
|
109
|
-
:allowed_hosts => [], # regex's
|
110
|
-
:allowed_urls => [], # regex's
|
111
|
-
:excluded_urls => ["logout"], # regex's
|
112
|
-
:excluded_fields => ["userid","username","password"], # regex's'
|
113
|
-
:excluded_form_names => [], # regex's'
|
114
|
-
:root_path => "", # regex
|
115
|
-
:username => "",
|
116
|
-
:password => "",
|
117
|
-
:auth_uri => nil,
|
118
|
-
:auth_domain => "", # for ntlm auth
|
119
|
-
:cookie_jar => nil
|
120
|
-
}
|
121
|
-
|
122
|
-
@opts.update opts
|
123
|
-
@opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?
|
124
|
-
|
125
|
-
@stats = {
|
126
|
-
:total_requests => 0
|
127
|
-
}
|
128
|
-
|
129
|
-
@link_keys = Hash.new
|
130
|
-
@link_counts = Hash.new
|
131
|
-
|
132
|
-
@form_keys = Hash.new
|
133
|
-
@form_counts = Hash.new
|
134
|
-
|
135
|
-
end
|
136
|
-
|
137
|
-
def pause
|
138
|
-
false
|
139
|
-
end
|
140
|
-
|
141
|
-
def cancel
|
142
|
-
puts "[CRAWLER] - CANCEL!!"
|
143
|
-
#@status_lock.synchronize do
|
144
|
-
# @engine_status = CRAWL_NONE
|
2
|
+
module Watobo#:nodoc: all
|
3
|
+
module Crawler
|
4
|
+
|
5
|
+
class Agent < Mechanize
|
6
|
+
|
7
|
+
def initialize(opts)
|
8
|
+
super()
|
9
|
+
|
10
|
+
|
11
|
+
self.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
12
|
+
self.ignore_bad_chunking = true
|
13
|
+
self.keep_alive = false
|
14
|
+
|
15
|
+
self.user_agent = opts[:user_agent] if opts.has_key?(:user_agent)
|
16
|
+
|
17
|
+
if opts.has_key? :username and opts.has_key? :password
|
18
|
+
unless opts[:username].empty? and opts[:password].empty?
|
19
|
+
|
20
|
+
user = opts[:username]
|
21
|
+
pw = opts[:password]
|
22
|
+
uri = opts[:auth_uri]
|
23
|
+
# puts "Got Credentials for #{uri}: #{user} / #{pw}"
|
24
|
+
self.add_auth(uri, user , pw )
|
25
|
+
# TODO: remove this workaround for a Mechanize Bug (#243)
|
26
|
+
p = self.get uri
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
if ( opts.has_key? :proxy_host ) && ( opts.has_key? :proxy_port )
|
31
|
+
self.set_proxy( opts[:proxy_host], opts[:proxy_port] )
|
32
|
+
end
|
33
|
+
|
34
|
+
if opts.has_key? :pre_connect_hook
|
35
|
+
self.pre_connect_hooks << opts[:pre_connect_hook] if opts[:pre_connect_hook].respond_to? :call
|
36
|
+
end
|
37
|
+
|
38
|
+
unless opts[:cookie_jar].nil?
|
39
|
+
clean_jar = Mechanize::CookieJar.new
|
40
|
+
opts[:cookie_jar].each{ |cookie|
|
41
|
+
clean_jar.add! cookie
|
42
|
+
}
|
43
|
+
self.cookie_jar = clean_jar
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
49
|
+
|
50
|
+
class Engine
|
51
|
+
include Watobo::Plugin::Crawler::Constants
|
52
|
+
|
53
|
+
def subscribe(event, &callback)
|
54
|
+
(@event_dispatcher_listeners[event] ||= []) << callback
|
55
|
+
end
|
56
|
+
|
57
|
+
# Drops every listener registered for +event+.
# An empty listener list is created first if the event was unknown.
#
# @return [Array] the (now empty) listener list
def clearEvents(event)
  listeners = (@event_dispatcher_listeners[event] ||= [])
  listeners.clear
end
|
61
|
+
|
62
|
+
# Calls every listener subscribed to +event+ with +args+.
# Entries that do not respond to #call are skipped.
#
# @return [Array, nil] the listener list, or nil when nothing is registered
def notify(event, *args)
  listeners = @event_dispatcher_listeners[event]
  return nil unless listeners

  # puts "NOTIFY: #{self}(:#{event}) [#{listeners.length}]" if $DEBUG
  listeners.each { |listener| listener.call(*args) if listener.respond_to? :call }
end
|
70
|
+
|
71
|
+
# Current crawler options.
#
# @return [Hash] the engine's own option hash (not a copy)
def settings
  return @opts
end
|
74
|
+
|
75
|
+
|
76
|
+
|
77
|
+
# Fetches one URL with a freshly created agent.
#
# @param url  [String, URI] address handed to the agent's #get
# @param opts [Hash] per-request overrides, merged over the engine options
#   without modifying them
# @return [Array] the agent and the page it fetched, in that order
def get_page(url, opts={})
  request_opts = @opts.merge(opts)
  agent = Crawler::Agent.new(request_opts)
  fetched = agent.get url
  [agent, fetched]
end
|
85
|
+
|
86
|
+
# Sets up listener registry, default options and bookkeeping tables.
#
# @param opts [Hash] overrides merged over the defaults below
def initialize(opts={})
  @event_dispatcher_listeners = {}
  @status_lock = Mutex.new

  @opts = {
    :submit_forms => true,
    :max_depth => 5,
    :max_repeat => 20,
    :max_threads => 4,
    :user_agent => "watobo-crawler",
    :proxy_host => '127.0.0.1',
    :proxy_port => Watobo::Conf::Interceptor.port,
    :delay => 0,
    :head_request_pattern => '(pdf|swf|doc|flv|jpg|png|gif)',
    :allowed_hosts => [], # regex's
    :allowed_urls => [], # regex's
    :excluded_urls => ["logout"], # regex's
    :excluded_fields => ["userid","username","password"], # regex's'
    :excluded_form_names => [], # regex's'
    :root_path => "", # regex
    :username => "",
    :password => "",
    :auth_uri => nil,
    :auth_domain => "", # for ntlm auth
    :cookie_jar => nil
  }

  @opts.update opts
  # A nil pattern is normalised to an empty string.
  @opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?

  @stats = {
    :total_requests => 0
  }

  # Digests of links/forms already seen, and per-pattern repeat counters.
  @link_keys = {}
  @link_counts = {}

  @form_keys = {}
  @form_counts = {}

  # #run replaces these with fresh objects. They are also created here so
  # that #cancel (which iterates the threads and clears the queues) does
  # not call methods on nil when invoked before the first #run.
  @link_queue = Queue.new
  @page_queue = Queue.new
  @grabber_threads = []
  @skipped_sites = {}

end
|
127
|
+
|
128
|
+
# Pausing is not implemented; the method only reports failure.
#
# @return [false] always
def pause
  return false
end
|
131
|
+
|
132
|
+
def cancel
|
133
|
+
puts "[CRAWLER] - CANCEL!!"
|
134
|
+
#@status_lock.synchronize do
|
135
|
+
# @engine_status = CRAWL_NONE
|
145
136
|
#end
|
146
|
-
Watobo::Crawler::Status.engine = CRAWL_NONE
|
147
|
-
@grabber_threads.each do |gt|
|
148
|
-
puts "Killing Thread #{gt}"
|
149
|
-
gt.kill
|
150
|
-
gt.raise "CANCEL"
|
151
|
-
end
|
152
|
-
@grabber_threads.each{|t| t.join }
|
153
|
-
|
154
|
-
@link_queue.clear
|
155
|
-
@page_queue.clear
|
156
|
-
@grabber_threads.clear
|
157
|
-
@link_keys.clear
|
158
|
-
@link_counts.clear
|
159
|
-
|
160
|
-
@form_keys.clear
|
161
|
-
@form_counts.clear
|
162
|
-
|
163
|
-
#notify( :update_status, current_status )
|
164
|
-
puts "CANCELED - CANCELED"
|
165
|
-
# exit
|
166
|
-
end
|
167
|
-
|
137
|
+
Watobo::Crawler::Status.engine = CRAWL_NONE
|
138
|
+
@grabber_threads.each do |gt|
|
139
|
+
puts "Killing Thread #{gt}"
|
140
|
+
gt.kill
|
141
|
+
gt.raise "CANCEL"
|
142
|
+
end
|
143
|
+
@grabber_threads.each{|t| t.join }
|
144
|
+
|
145
|
+
@link_queue.clear
|
146
|
+
@page_queue.clear
|
147
|
+
@grabber_threads.clear
|
148
|
+
@link_keys.clear
|
149
|
+
@link_counts.clear
|
150
|
+
|
151
|
+
@form_keys.clear
|
152
|
+
@form_counts.clear
|
153
|
+
|
154
|
+
#notify( :update_status, current_status )
|
155
|
+
puts "CANCELED - CANCELED"
|
156
|
+
# exit
|
157
|
+
end
|
158
|
+
|
168
159
|
def run(url, opts={})
|
169
160
|
#engine_status = CRAWL_RUNNING
|
170
161
|
Watobo::Crawler::Status.reset
|
171
162
|
Watobo::Crawler::Status.engine = CRAWL_RUNNING
|
172
|
-
|
173
|
-
@opts.update opts
|
163
|
+
|
164
|
+
@opts.update opts
|
174
165
|
@opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?
|
175
166
|
|
176
167
|
puts "crawler settings:"
|
177
168
|
puts @opts.to_json
|
178
|
-
|
179
|
-
|
180
|
-
@link_queue = Queue.new
|
181
|
-
@page_queue = Queue.new
|
169
|
+
|
170
|
+
|
171
|
+
@link_queue = Queue.new
|
172
|
+
@page_queue = Queue.new
|
173
|
+
|
174
|
+
@link_keys = Hash.new
|
175
|
+
@link_counts = Hash.new
|
182
176
|
|
183
|
-
@
|
184
|
-
@
|
185
|
-
|
186
|
-
@
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
start_link
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
process_links(pagebag)
|
213
|
-
|
214
|
-
process_forms(pagebag)
|
177
|
+
@form_keys = Hash.new
|
178
|
+
@form_counts = Hash.new
|
179
|
+
|
180
|
+
@skipped_sites = Hash.new
|
181
|
+
|
182
|
+
@grabber_threads = []
|
183
|
+
start_link = URI.parse url
|
184
|
+
return false if start_link.host.nil?
|
185
|
+
|
186
|
+
allow_host(start_link)
|
187
|
+
|
188
|
+
@link_queue.enq LinkBag.new(start_link, 0)
|
189
|
+
|
190
|
+
|
191
|
+
notify(:log, "Crawling #{url} started ..." )
|
192
|
+
|
193
|
+
@opts[:max_threads].times do |i|
|
194
|
+
g = Grabber.new(@link_queue, @page_queue, @opts )
|
195
|
+
@grabber_threads << g.run
|
196
|
+
end
|
197
|
+
|
198
|
+
puts "* startet #{@grabber_threads.length} grabbers"
|
199
|
+
|
200
|
+
loop do
|
201
|
+
pagebag = @page_queue.deq
|
202
|
+
|
203
|
+
process_links(pagebag)
|
204
|
+
|
205
|
+
process_forms(pagebag)
|
215
206
|
#@stats[:total_requests] += 1 unless pagebag.nil?
|
216
207
|
Watobo::Crawler::Status.inc_requests() unless pagebag.nil?
|
217
208
|
Watobo::Crawler::Status.page_size= @page_queue.size
|
218
|
-
Watobo::Crawler::Status.link_size= @link_queue.size
|
219
|
-
|
220
|
-
puts "Links/Pages: #{@link_queue.size}/#{@page_queue.size}"
|
221
|
-
#notify( :update_status, current_status )
|
222
|
-
# if @link_queue.empty? and @page_queue.empty?
|
223
|
-
if @page_queue.empty?
|
224
|
-
# if page_queue is empty wait for all grabber threads finishing the link_queue
|
225
|
-
until @link_queue.num_waiting == @grabber_threads.length
|
226
|
-
Thread.pass
|
227
|
-
end
|
228
|
-
# when the link_queue is finished check the page_queue. Crawling is finished if page_queue is empty too.
|
229
|
-
if @page_queue.empty?
|
230
|
-
@grabber_threads.each { |t| t.kill }
|
231
|
-
puts "Finished Crawling"
|
209
|
+
Watobo::Crawler::Status.link_size= @link_queue.size
|
210
|
+
|
211
|
+
puts "Links/Pages: #{@link_queue.size}/#{@page_queue.size}"
|
212
|
+
#notify( :update_status, current_status )
|
213
|
+
# if @link_queue.empty? and @page_queue.empty?
|
214
|
+
if @page_queue.empty?
|
215
|
+
# if page_queue is empty wait for all grabber threads finishing the link_queue
|
216
|
+
until @link_queue.num_waiting == @grabber_threads.length
|
217
|
+
Thread.pass
|
218
|
+
end
|
219
|
+
# when the link_queue is finished check the page_queue. Crawling is finished if page_queue is empty too.
|
220
|
+
if @page_queue.empty?
|
221
|
+
@grabber_threads.each { |t| t.kill }
|
222
|
+
puts "Finished Crawling"
|
232
223
|
#@status_lock.synchronize{ @engine_status = CRAWL_NONE }
|
233
224
|
Watobo::Crawler::Status.engine = CRAWL_NONE
|
234
|
-
|
235
|
-
notify(:log, "Crawling finished")
|
236
|
-
#notify( :update_status, current_status )
|
237
|
-
break
|
238
|
-
|
239
|
-
end
|
240
|
-
end
|
241
|
-
|
242
|
-
end
|
243
|
-
|
244
|
-
end
|
245
|
-
|
246
|
-
private
|
247
|
-
|
248
|
-
def current_status
|
225
|
+
|
226
|
+
notify(:log, "Crawling finished")
|
227
|
+
#notify( :update_status, current_status )
|
228
|
+
break
|
229
|
+
|
230
|
+
end
|
231
|
+
end
|
232
|
+
|
233
|
+
end
|
234
|
+
|
235
|
+
end
|
236
|
+
|
237
|
+
private
|
238
|
+
|
239
|
+
def current_status
|
249
240
|
{
|
250
|
-
:engine_status => @engine_status,
|
251
|
-
:link_size => @link_queue.size,
|
252
|
-
:page_size => @page_queue.size
|
253
|
-
}.update @stats
|
254
|
-
|
255
|
-
end
|
256
|
-
|
257
|
-
|
258
|
-
def allow_host(uri)
|
259
|
-
if uri.is_a? URI
|
260
|
-
site = uri.site.to_s
|
261
|
-
# puts "Valid Site: #{site}"
|
262
|
-
ah = allowed_hosts
|
263
|
-
ah << site
|
264
|
-
end
|
265
|
-
end
|
266
|
-
|
267
|
-
def process_forms(pagebag)
|
268
|
-
return false unless pagebag.respond_to? :page
|
269
|
-
page=pagebag.page
|
270
|
-
return false unless page.respond_to? :forms
|
271
|
-
page.forms.each do |f|
|
272
|
-
|
273
|
-
action = page.uri.merge f.action unless f.action =~ /^http/
|
274
|
-
f.action = action.to_s
|
275
|
-
|
276
|
-
if send_form? f
|
277
|
-
# puts "SUBMIT FORM: #{f.action}"
|
278
|
-
send_form(f, pagebag.depth)
|
279
|
-
end
|
280
|
-
end
|
281
|
-
end
|
282
|
-
|
283
|
-
def process_links(pagebag)
|
284
|
-
return false unless pagebag.respond_to? :page
|
285
|
-
page = pagebag.page
|
286
|
-
return false unless page.respond_to? :links
|
287
|
-
|
288
|
-
page.links.each do |l|
|
289
|
-
begin
|
241
|
+
:engine_status => @engine_status,
|
242
|
+
:link_size => @link_queue.size,
|
243
|
+
:page_size => @page_queue.size
|
244
|
+
}.update @stats
|
245
|
+
|
246
|
+
end
|
247
|
+
|
248
|
+
|
249
|
+
# Adds the site of +uri+ to the allowed-hosts option.
#
# The entries of that list are interpolated into a regular expression by
# host_allowed?, so the site is stored regex-escaped (otherwise the dots
# of a host name would match any character) and only once.
#
# NOTE(review): #site is not part of Ruby's standard URI classes -
# presumably added elsewhere in the project; confirm.
#
# @param uri [URI] start URL of the crawl; anything else is ignored
# @return [Array, nil] the allowed-hosts list, or nil if +uri+ is no URI
def allow_host(uri)
  return nil unless uri.is_a? URI

  pattern = Regexp.escape(uri.site.to_s)
  # puts "Valid Site: #{pattern}"
  ah = allowed_hosts
  ah << pattern unless ah.include?(pattern)
  ah
end
|
257
|
+
|
258
|
+
# Resolves the action of every form on a fetched page and submits the forms
# that pass send_form?.
#
# @param pagebag [#page, #depth] wrapper around a fetched page
# @return [Array, false] the page's form list, or false when +pagebag+ or
#   its page does not expose the needed interface
def process_forms(pagebag)
  return false unless pagebag.respond_to? :page
  page = pagebag.page
  return false unless page.respond_to? :forms

  page.forms.each do |f|
    begin
      # Only relative actions are rewritten. Absolute ones are left alone;
      # previously they were overwritten with an empty string because the
      # helper variable was never assigned in that case. to_s makes a
      # missing action resolve to the page's own URL.
      unless f.action =~ /^http/
        f.action = page.uri.merge(f.action.to_s).to_s
      end

      if send_form? f
        # puts "SUBMIT FORM: #{f.action}"
        send_form(f, pagebag.depth)
      end
    rescue => bang
      # One malformed form must not abort the remaining ones
      # (same policy as process_links).
      puts bang
      puts bang.backtrace if $DEBUG
    end
  end
end
|
273
|
+
|
274
|
+
def process_links(pagebag)
|
275
|
+
return false unless pagebag.respond_to? :page
|
276
|
+
page = pagebag.page
|
277
|
+
return false unless page.respond_to? :links
|
278
|
+
|
279
|
+
page.links.each do |l|
|
280
|
+
begin
|
290
281
|
link = l
|
291
282
|
next if l.href.nil?
|
292
|
-
|
293
|
-
link = page.uri.merge l.uri unless l.href =~ /^http/
|
294
|
-
# puts "FOLLOW LINK #{link} ?"
|
295
|
-
if follow_link? link
|
296
|
-
# puts ">> OK"
|
297
|
-
submit_link(link, pagebag.depth)
|
298
|
-
else
|
299
|
-
# puts ">> NO"
|
300
|
-
end
|
301
|
-
rescue => bang
|
283
|
+
|
284
|
+
link = page.uri.merge l.uri unless l.href =~ /^http/
|
285
|
+
# puts "FOLLOW LINK #{link} ?"
|
286
|
+
if follow_link? link
|
287
|
+
# puts ">> OK"
|
288
|
+
submit_link(link, pagebag.depth)
|
289
|
+
else
|
290
|
+
# puts ">> NO"
|
291
|
+
end
|
292
|
+
rescue => bang
|
302
293
|
puts bang
|
303
|
-
puts bang.backtrace if $DEBUG
|
304
|
-
end
|
305
|
-
end
|
306
|
-
|
307
|
-
end
|
308
|
-
|
309
|
-
|
310
|
-
def submit_link(link, depth)
|
311
|
-
# @link_keys[link_key(link)] = link
|
312
|
-
|
313
|
-
clk = link_key(link, :clear_values => true)
|
314
|
-
@link_counts[clk] ||= 0
|
315
|
-
@link_counts[clk] += 1
|
316
|
-
lk = link_key(link)
|
317
|
-
return false if @link_keys.has_key? lk
|
318
|
-
@link_keys[lk] = nil
|
319
|
-
if @link_counts[clk] < @opts[:max_repeat]
|
320
|
-
@link_queue.enq LinkBag.new(link, depth)
|
321
|
-
else
|
322
|
-
puts "! MAX REPEAT !\nSkipped link #{link}" if $DEBUG
|
323
|
-
end
|
324
|
-
end
|
325
|
-
|
326
|
-
def form_key(form, opts={} )
|
327
|
-
o = { :clear_values => false }
|
328
|
-
o.update opts
|
329
|
-
|
330
|
-
fp = "#{form.action}"
|
331
|
-
fp << form.method
|
332
|
-
if form.request_data =~ /=/
|
333
|
-
data = form.request_data.split("&").sort.join("&")
|
334
|
-
if o[:clear_values]
|
335
|
-
fp << data.gsub(/=[^&]*/,'=')
|
336
|
-
else
|
337
|
-
fp << data
|
338
|
-
end
|
339
|
-
end
|
340
|
-
fkey = Digest::MD5.hexdigest fp
|
341
|
-
fkey
|
342
|
-
end
|
343
|
-
|
344
|
-
def send_form(form, depth)
|
345
|
-
return false if @engine_status == CRAWL_NONE
|
346
|
-
cfk = form_key(form, :clear_values => true)
|
347
|
-
@form_counts[cfk] ||= 0
|
348
|
-
@form_counts[cfk] += 1
|
349
|
-
|
350
|
-
# @form_keys[form_key(form)] = form
|
351
|
-
fk = form_key(form)
|
352
|
-
return false if @form_keys.has_key? fk
|
353
|
-
@form_keys[fk] = nil
|
354
|
-
begin
|
355
|
-
if @form_counts[cfk] < @opts[:max_repeat]
|
356
|
-
if form.buttons.length > 0
|
357
|
-
p = form.click_button
|
358
|
-
else
|
359
|
-
p = form.submit()
|
360
|
-
end
|
361
|
-
puts p.class
|
362
|
-
@page_queue.enq PageBag.new(p, depth+1)
|
363
|
-
else
|
364
|
-
puts "! MAX REPEAT !\nSkipped Form #{form.action}"
|
365
|
-
end
|
366
|
-
rescue => bang
|
367
|
-
puts bang
|
368
|
-
puts bang.backtrace
|
369
|
-
end
|
370
|
-
end
|
371
|
-
|
372
|
-
def send_form?(form)
|
373
|
-
# puts "SEND FORM?"
|
374
|
-
return false unless engine_running?
|
375
|
-
return false unless @opts[:submit_forms] == true
|
376
|
-
# puts "> submit_forms"
|
377
|
-
return false unless allowed? form.action
|
378
|
-
#puts "> allowed"
|
379
|
-
return false unless fields_allowed? form
|
380
|
-
#puts "> fields allowed"
|
381
|
-
return false if form_sent? form
|
382
|
-
# puts "> form not sent"
|
383
|
-
return true
|
384
|
-
end
|
385
|
-
|
386
|
-
def follow_link?(link)
|
387
|
-
return false unless allowed? link
|
388
|
-
return false if link_is_followed? link
|
389
|
-
return true
|
390
|
-
end
|
391
|
-
|
392
|
-
def host_allowed?(uri)
|
393
|
-
#puts "ALLOWED HOSTS =>"
|
394
|
-
#puts allowed_hosts
|
395
|
-
#puts "---"
|
396
|
-
# puts "Host Allowed?"
|
397
|
-
ah = allowed_hosts
|
398
|
-
# puts ah.class
|
399
|
-
#puts ah
|
400
|
-
return false if ah.empty?
|
401
|
-
ahc = ah.select{ |h| uri.site =~ /^#{h}$/ }.length
|
402
|
-
if ahc > 0
|
403
|
-
# puts "> Host IS allowed!"
|
404
|
-
return true
|
405
|
-
end
|
406
|
-
# puts "> Host is NOT allowed!"
|
407
|
-
return false
|
408
|
-
end
|
409
|
-
|
410
|
-
def url_allowed?(uri)
|
411
|
-
# puts "* excluded_urls"
|
412
|
-
# puts exluded_urls
|
413
|
-
return false if excluded_urls.select{ |url| uri.path =~ /#{url}/ }.length > 0
|
414
|
-
# puts "* allowed_urls"
|
415
|
-
# puts allowed_urls
|
416
|
-
return true if allowed_urls.empty?
|
417
|
-
return true if allowed_urls.select{ |url| uri.path =~ /#{url}/ }.length > 0
|
418
|
-
# puts "> URL is NOT allowed"
|
419
|
-
return false
|
420
|
-
end
|
421
|
-
|
422
|
-
def path_allowed?(uri)
|
423
|
-
return true if root_path.nil?
|
424
|
-
return true if root_path.empty?
|
425
|
-
return true if uri.path =~ /^#{root_path}/
|
426
|
-
# puts "> PATH is NOT ALLOWED"
|
427
|
-
return false
|
428
|
-
end
|
429
|
-
|
430
|
-
def cleanup_uri(obj)
|
431
|
-
uri = nil
|
432
|
-
uri = obj.uri if obj.respond_to? :uri
|
433
|
-
uri = URI.parse(obj) if obj.is_a? String
|
434
|
-
uri = obj if obj.is_a? URI::HTTP
|
435
|
-
uri
|
436
|
-
end
|
437
|
-
|
438
|
-
def allowed?(link)
|
439
|
-
valid = false
|
440
|
-
# need to handle different link objects, Mechanize::Page::Link and URIs
|
441
|
-
uri = nil
|
442
|
-
uri = link.uri if link.respond_to? :uri
|
443
|
-
uri = URI.parse(link) if link.is_a? String
|
444
|
-
uri = link if link.is_a? URI::HTTP
|
445
|
-
|
446
|
-
return false if uri.nil?
|
447
|
-
|
448
|
-
host_allowed?(uri) &&
|
449
|
-
url_allowed?(uri) &&
|
450
|
-
path_allowed?(uri)
|
451
|
-
end
|
452
|
-
|
453
|
-
def form_sent?(form)
|
454
|
-
|
455
|
-
@form_keys.has_key? form_key(form)
|
456
|
-
end
|
457
|
-
|
458
|
-
def link_key(link, opts={})
|
459
|
-
o = { :clear_values => false }
|
460
|
-
o.update opts
|
461
|
-
|
462
|
-
uri = cleanup_uri(link)
|
463
|
-
|
464
|
-
query_sorted = ""
|
465
|
-
query_sorted = uri.query.split("&").sort.join("&") unless uri.query.nil?
|
466
|
-
|
467
|
-
key = ""
|
468
|
-
key << uri.scheme
|
469
|
-
key << uri.site
|
470
|
-
key << uri.path
|
471
|
-
key << query_sorted
|
472
|
-
key.gsub!(/=[^&]*/,'=') if o[:clear_values] == true
|
473
|
-
|
474
|
-
Digest::MD5.hexdigest key
|
475
|
-
end
|
476
|
-
|
477
|
-
def engine_running?
|
478
|
-
@status_lock.synchronize do
|
479
|
-
return false if @engine_status == CRAWL_NONE
|
480
|
-
return true
|
481
|
-
end
|
482
|
-
end
|
483
|
-
|
484
|
-
def link_is_followed?(link)
|
485
|
-
|
486
|
-
return true if @link_keys.has_key? link_key(link)
|
487
|
-
|
488
|
-
false
|
489
|
-
end
|
490
|
-
|
491
|
-
def fields_allowed?(form)
|
492
|
-
form.fields.each do |f|
|
493
|
-
excluded_fields.each do |ef|
|
494
|
-
return false if f.name =~ /#{ef}/
|
495
|
-
end
|
496
|
-
end
|
497
|
-
return true
|
498
|
-
end
|
499
|
-
|
500
|
-
def method_missing(name, *args, &block)
|
501
|
-
# puts "* instance method missing (#{name})"
|
502
|
-
if name =~ /(.*)=$/
|
503
|
-
@opts.has_key? $1.to_sym || super
|
504
|
-
@opts[$1.to_sym] = args[0]
|
505
|
-
return @opts[$1.to_sym]
|
506
|
-
else
|
507
|
-
k = name.to_sym
|
508
|
-
@opts.has_key? k || super
|
509
|
-
# puts "Value Found For #{k.to_yaml}"
|
510
|
-
return @opts[k]
|
511
|
-
|
512
|
-
end
|
513
|
-
end
|
514
|
-
end
|
515
|
-
end
|
516
|
-
|
517
|
-
end
|
294
|
+
puts bang.backtrace if $DEBUG
|
295
|
+
end
|
296
|
+
end
|
297
|
+
|
298
|
+
end
|
299
|
+
|
300
|
+
|
301
|
+
# Queues a link for fetching unless it was queued before or its URL
# pattern (same URL with parameter values blanked) has been seen too often.
#
# @param link  [Object] link accepted by link_key
# @param depth [Integer] depth stored in the queued LinkBag
# @return [false] when the exact link is already known
# @return [Object, nil] result of the enqueue, or nil when skipped
def submit_link(link, depth)
  # The pattern counter is bumped for every call, duplicates included.
  pattern_key = link_key(link, :clear_values => true)
  @link_counts[pattern_key] = (@link_counts[pattern_key] || 0) + 1

  exact_key = link_key(link)
  return false if @link_keys.key?(exact_key)
  @link_keys[exact_key] = nil

  if @link_counts[pattern_key] >= @opts[:max_repeat]
    puts "! MAX REPEAT !\nSkipped link #{link}" if $DEBUG
  else
    @link_queue.enq LinkBag.new(link, depth)
  end
end
|
316
|
+
|
317
|
+
# Builds an MD5 fingerprint of a form from its action, its method and its
# request data with the parameters sorted.
#
# @param form [#action, #method, #request_data]
# @param opts [Hash] :clear_values => true blanks every parameter value, so
#   forms differing only in values share one key
# @return [String] hex digest
def form_key(form, opts={} )
  blank_values = { :clear_values => false }.merge(opts)[:clear_values]

  fingerprint = form.action.to_s.dup
  fingerprint << form.method

  payload = form.request_data
  if payload =~ /=/
    sorted = payload.split("&").sort.join("&")
    fingerprint << (blank_values ? sorted.gsub(/=[^&]*/,'=') : sorted)
  end

  Digest::MD5.hexdigest fingerprint
end
|
334
|
+
|
335
|
+
# Submits a form once and queues the response page one level deeper.
#
# A form is skipped when the exact form (values included) was handled
# before, or when its value-less pattern reached the :max_repeat option.
# Errors raised while submitting are printed, not propagated.
#
# NOTE(review): @engine_status is not assigned anywhere in the visible code,
# so the first guard may never trigger - confirm.
#
# @param form  [Object] Mechanize-style form (#buttons, #click_button, #submit)
# @param depth [Integer] depth of the page the form was found on
# @return [false] when the engine is stopped or the form is already known
def send_form(form, depth)
  return false if @engine_status == CRAWL_NONE

  # The pattern counter is bumped for every call, duplicates included.
  pattern_key = form_key(form, :clear_values => true)
  @form_counts[pattern_key] = (@form_counts[pattern_key] || 0) + 1

  exact_key = form_key(form)
  return false if @form_keys.key?(exact_key)
  @form_keys[exact_key] = nil

  begin
    if @form_counts[pattern_key] >= @opts[:max_repeat]
      puts "! MAX REPEAT !\nSkipped Form #{form.action}"
    else
      # Prefer clicking a button when the form has one.
      response = form.buttons.length > 0 ? form.click_button : form.submit()
      puts response.class
      @page_queue.enq PageBag.new(response, depth+1)
    end
  rescue => bang
    puts bang
    puts bang.backtrace
  end
end
|
362
|
+
|
363
|
+
# Decides whether a form should be submitted.
#
# Checks run in this order and stop at the first failure: the engine is
# running, the :submit_forms option is exactly true, the action is allowed,
# no field name is excluded, and the form has not been sent yet.
#
# @return [Boolean]
def send_form?(form)
  rejected = !engine_running? ||
             @opts[:submit_forms] != true ||
             !allowed?(form.action) ||
             !fields_allowed?(form) ||
             form_sent?(form)
  rejected ? false : true
end
|
376
|
+
|
377
|
+
# A link is followed when it is allowed and has not been followed before.
#
# @return [Boolean]
def follow_link?(link)
  rejected = !allowed?(link) || link_is_followed?(link)
  rejected ? false : true
end
|
382
|
+
|
383
|
+
# Checks the site of +uri+ against the allowed-hosts patterns.
# Each pattern is used as a regular expression anchored with ^ and $.
# An empty pattern list allows nothing.
#
# @param uri [#site]
# @return [Boolean]
def host_allowed?(uri)
  patterns = allowed_hosts
  return false if patterns.empty?

  hits = patterns.count { |pattern| uri.site =~ /^#{pattern}$/ }
  hits > 0
end
|
400
|
+
|
401
|
+
# Checks the path of +uri+ against the excluded and allowed URL patterns.
#
# An exclusion match always rejects. After that, an empty allow list
# accepts everything; otherwise one allow pattern has to match.
#
# @param uri [#path]
# @return [Boolean]
def url_allowed?(uri)
  excluded_hits = excluded_urls.count { |pattern| uri.path =~ /#{pattern}/ }
  return false unless excluded_hits.zero?

  whitelist = allowed_urls
  return true if whitelist.empty?

  whitelist.count { |pattern| uri.path =~ /#{pattern}/ } > 0
end
|
412
|
+
|
413
|
+
# Checks that the path of +uri+ starts with the root-path pattern.
# A nil or empty root path disables the restriction.
#
# @param uri [#path]
# @return [Boolean]
def path_allowed?(uri)
  base = root_path
  return true if base.nil? || base.empty?

  (uri.path =~ /^#{base}/) ? true : false
end
|
420
|
+
|
421
|
+
# Converts the different link representations into a URI.
#
# Later rules win: an object's #uri is used first, a String is parsed,
# and a URI::HTTP instance is returned as is.
#
# @param obj [#uri, String, URI::HTTP]
# @return [URI, nil] nil when +obj+ is none of the supported kinds
def cleanup_uri(obj)
  resolved = obj.respond_to?(:uri) ? obj.uri : nil
  resolved = URI.parse(obj) if obj.is_a? String
  obj.is_a?(URI::HTTP) ? obj : resolved
end
|
428
|
+
|
429
|
+
# Tells whether a link may be requested at all: its host, URL and path
# checks must all pass.
#
# Different link objects are accepted: anything with #uri (e.g. a
# Mechanize::Page::Link), a String, or a URI::HTTP. Later rules win.
#
# @return [Boolean] false when no URI can be derived from +link+
def allowed?(link)
  uri = link.respond_to?(:uri) ? link.uri : nil
  uri = URI.parse(link) if link.is_a? String
  uri = link if link.is_a? URI::HTTP

  return false if uri.nil?

  host_allowed?(uri) && url_allowed?(uri) && path_allowed?(uri)
end
|
443
|
+
|
444
|
+
# Tells whether the fingerprint of +form+ (values included) was recorded.
#
# @return [Boolean]
def form_sent?(form)
  fingerprint = form_key(form)
  @form_keys.key?(fingerprint)
end
|
448
|
+
|
449
|
+
# Builds an MD5 fingerprint of a link from scheme, site, path and the
# query with its parameters sorted.
#
# Path and query are joined with "?" so that "/a?b=1" and "/ab=1" no longer
# produce the same key. Links without a query hash exactly as before.
#
# NOTE(review): #site is not part of Ruby's standard URI classes -
# presumably added elsewhere in the project; confirm.
#
# @param link [Object] anything cleanup_uri can turn into a URI
# @param opts [Hash] :clear_values => true blanks everything between "="
#   and the next "&", so links differing only in values share one key
# @return [String] hex digest
def link_key(link, opts={})
  o = { :clear_values => false }
  o.update opts

  uri = cleanup_uri(link)

  key = ""
  key << uri.scheme
  key << uri.site
  key << uri.path
  unless uri.query.nil?
    # Parameter order must not matter, hence the sort.
    key << "?" << uri.query.split("&").sort.join("&")
  end
  key.gsub!(/=[^&]*/,'=') if o[:clear_values] == true

  Digest::MD5.hexdigest key
end
|
467
|
+
|
468
|
+
# Reports, under the status lock, whether the engine status differs from
# CRAWL_NONE.
#
# NOTE(review): @engine_status is not assigned anywhere in the visible code
# - confirm that this check can ever be false.
#
# @return [Boolean]
def engine_running?
  @status_lock.synchronize { @engine_status != CRAWL_NONE }
end
|
474
|
+
|
475
|
+
# Tells whether the fingerprint of +link+ (values included) was recorded.
#
# @return [Boolean]
def link_is_followed?(link)
  fingerprint = link_key(link)
  @link_keys.key?(fingerprint)
end
|
481
|
+
|
482
|
+
# A form is acceptable when none of its field names matches one of the
# excluded-field patterns (used as unanchored regular expressions).
#
# @param form [#fields] each field responding to #name
# @return [Boolean]
def fields_allowed?(form)
  blocked = form.fields.any? do |field|
    excluded_fields.any? { |pattern| field.name =~ /#{pattern}/ }
  end
  !blocked
end
|
490
|
+
|
491
|
+
# Exposes the entries of @opts as pseudo accessors: +name+ reads an option,
# +name=+ writes it.
#
# Names that are not option keys are passed on to the default
# implementation and therefore raise NoMethodError. The former check
# (`@opts.has_key? k || super`) was parsed as `@opts.has_key?(k || super)`,
# so unknown readers silently returned nil and unknown writers silently
# created new options.
#
# @return [Object] the (new) value of the option
# @raise [NoMethodError] when +name+ does not refer to an existing option
def method_missing(name, *args, &block)
  # puts "* instance method missing (#{name})"
  method_name = name.to_s
  writer = method_name.end_with?('=')
  key = (writer ? method_name.chomp('=') : method_name).to_sym

  return super unless @opts.is_a?(Hash) && @opts.has_key?(key)

  @opts[key] = args[0] if writer
  @opts[key]
end

# Keeps respond_to? consistent with the accessors served by method_missing.
def respond_to_missing?(name, include_private = false)
  key = name.to_s.chomp('=').to_sym
  (@opts.is_a?(Hash) && @opts.has_key?(key)) || super
end
|
505
|
+
end
|
506
|
+
end
|
507
|
+
|
508
|
+
end
|