google_robotstxt_parser 0.0.3

@@ -0,0 +1,241 @@
+ // Copyright 1999 Google LLC
+ //
+ // Licensed under the Apache License, Version 2.0 (the "License");
+ // you may not use this file except in compliance with the License.
+ // You may obtain a copy of the License at
+ //
+ //      https://www.apache.org/licenses/LICENSE-2.0
+ //
+ // Unless required by applicable law or agreed to in writing, software
+ // distributed under the License is distributed on an "AS IS" BASIS,
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+ //
+ // -----------------------------------------------------------------------------
+ // File: robots.h
+ // -----------------------------------------------------------------------------
+ //
+ // This file implements the standard defined by the Robots Exclusion Protocol
+ // (REP) internet draft (I-D).
+ //   https://tools.ietf.org/html/draft-koster-rep
+ //
+ // Google doesn't follow the standard strictly, because there are a lot of
+ // non-conforming robots.txt files out there, and we err on the side of
+ // disallowing when this seems intended.
+ //
+ // A more user-friendly description of how Google handles robots.txt can be
+ // found at:
+ //   https://developers.google.com/search/reference/robots_txt
+ //
+ // This library provides a low-level parser for robots.txt (ParseRobotsTxt()),
+ // and a matcher for URLs against a robots.txt (class RobotsMatcher).
+
+ #ifndef THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
+ #define THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
+
+ #include <string>
+ #include <vector>
+
+ #include "absl/strings/string_view.h"
+
+ namespace googlebot {
+ // Handler for directives found in robots.txt. These callbacks are called by
+ // ParseRobotsTxt() in the sequence they have been found in the file.
+ class RobotsParseHandler {
+  public:
+   RobotsParseHandler() {}
+   virtual ~RobotsParseHandler() {}
+
+   // Disallow copying and assignment.
+   RobotsParseHandler(const RobotsParseHandler&) = delete;
+   RobotsParseHandler& operator=(const RobotsParseHandler&) = delete;
+
+   virtual void HandleRobotsStart() = 0;
+   virtual void HandleRobotsEnd() = 0;
+
+   virtual void HandleUserAgent(int line_num, absl::string_view value) = 0;
+   virtual void HandleAllow(int line_num, absl::string_view value) = 0;
+   virtual void HandleDisallow(int line_num, absl::string_view value) = 0;
+
+   virtual void HandleSitemap(int line_num, absl::string_view value) = 0;
+
+   // Any other unrecognized name/value pairs.
+   virtual void HandleUnknownAction(int line_num, absl::string_view action,
+                                    absl::string_view value) = 0;
+ };
+
+ // Parses the body of a robots.txt file and emits parse callbacks. This will
+ // accept typical typos found in robots.txt, such as 'disalow'.
+ //
+ // Note that this function will accept all kinds of input, but will skip
+ // everything that does not look like a robots directive.
+ void ParseRobotsTxt(absl::string_view robots_body,
+                     RobotsParseHandler* parse_callback);
+
+ // RobotsMatcher - matches robots.txt against URLs.
+ //
+ // The Matcher uses a default match strategy for Allow/Disallow patterns, which
+ // is the official way Google's crawler matches robots.txt. It is also
+ // possible to provide a custom match strategy.
+ //
+ // The entry point for the user is to call one of the *AllowedByRobots()
+ // methods, which return whether a URL is allowed according to the robots.txt
+ // and the crawl agent.
+ // The RobotsMatcher can be re-used for URLs/robots.txt but is not thread-safe.
+ class RobotsMatchStrategy;
+ class RobotsMatcher : protected RobotsParseHandler {
+  public:
+   // Create a RobotsMatcher with the default matching strategy. The default
+   // matching strategy is longest-match as opposed to the former internet draft
+   // that provisioned first-match strategy. Analysis shows that longest-match,
+   // while more restrictive for crawlers, is what webmasters assume when writing
+   // directives. For example, in case of conflicting matches (both Allow and
+   // Disallow), the longest match is the one the user wants. For example, in
+   // case of a robots.txt file that has the following rules
+   //   Allow: /
+   //   Disallow: /cgi-bin
+   // it's pretty obvious what the webmaster wants: they want to allow crawl of
+   // every URI except /cgi-bin. However, according to the expired internet
+   // standard, crawlers should be allowed to crawl everything with such a rule.
+   RobotsMatcher();
+
+   ~RobotsMatcher() override;
+
+   // Disallow copying and assignment.
+   RobotsMatcher(const RobotsMatcher&) = delete;
+   RobotsMatcher& operator=(const RobotsMatcher&) = delete;
+
+   // Verifies that the given user agent is valid to be matched against
+   // robots.txt. Valid user agent strings only contain the characters
+   // [a-zA-Z_-].
+   static bool IsValidUserAgentToObey(absl::string_view user_agent);
+
+   // Returns true iff 'url' is allowed to be fetched by any member of the
+   // "user_agents" vector. 'url' must be %-encoded according to RFC3986.
+   bool AllowedByRobots(absl::string_view robots_body,
+                        const std::vector<std::string>* user_agents,
+                        const std::string& url);
+
+   // Do robots check for 'url' when there is only one user agent. 'url' must
+   // be %-encoded according to RFC3986.
+   bool OneAgentAllowedByRobots(absl::string_view robots_txt,
+                                const std::string& user_agent,
+                                const std::string& url);
+
+   // Returns true if we are disallowed from crawling a matching URI.
+   bool disallow() const;
+
+   // Returns true if we are disallowed from crawling a matching URI. Ignores any
+   // rules specified for the default user agent, and bases its results only on
+   // the specified user agents.
+   bool disallow_ignore_global() const;
+
+   // Returns true iff, when AllowedByRobots() was called, the robots file
+   // referred explicitly to one of the specified user agents.
+   bool ever_seen_specific_agent() const;
+
+   // Returns the line that matched or 0 if none matched.
+   const int matching_line() const;
+
+  protected:
+   // Parse callbacks.
+   // Protected because used in unittest. Never override RobotsMatcher, implement
+   // googlebot::RobotsParseHandler instead.
+   void HandleRobotsStart() override;
+   void HandleRobotsEnd() override {}
+
+   void HandleUserAgent(int line_num, absl::string_view value) override;
+   void HandleAllow(int line_num, absl::string_view value) override;
+   void HandleDisallow(int line_num, absl::string_view value) override;
+
+   void HandleSitemap(int line_num, absl::string_view value) override;
+   void HandleUnknownAction(int line_num, absl::string_view action,
+                            absl::string_view value) override;
+
+  protected:
+   // Extract the matchable part of a user agent string, essentially stopping at
+   // the first invalid character.
+   // Example: 'Googlebot/2.1' becomes 'Googlebot'
+   static absl::string_view ExtractUserAgent(absl::string_view user_agent);
+
+   // Initialize next path and user-agents to check. Path must contain only the
+   // path, params, and query (if any) of the url and must start with a '/'.
+   void InitUserAgentsAndPath(const std::vector<std::string>* user_agents,
+                              const char* path);
+
+   // Returns true if any user-agent was seen.
+   bool seen_any_agent() const {
+     return seen_global_agent_ || seen_specific_agent_;
+   }
+
+   // Instead of just maintaining a Boolean indicating whether a given line has
+   // matched, we maintain a count of the maximum number of characters matched by
+   // that pattern.
+   //
+   // This structure stores the information associated with a match (e.g. when a
+   // Disallow is matched) as the priority of the match and the line that matched.
+   //
+   // The priority is initialized with a negative value to make sure that a match
+   // of priority 0 is higher priority than no match at all.
+   class Match {
+    private:
+     static const int kNoMatchPriority = -1;
+
+    public:
+     Match(int priority, int line) : priority_(priority), line_(line) {}
+     Match() : priority_(kNoMatchPriority), line_(0) {}
+
+     void Set(int priority, int line) {
+       priority_ = priority;
+       line_ = line;
+     }
+
+     void Clear() { Set(kNoMatchPriority, 0); }
+
+     int line() const { return line_; }
+     int priority() const { return priority_; }
+
+     static const Match& HigherPriorityMatch(const Match& a, const Match& b) {
+       if (a.priority() > b.priority()) {
+         return a;
+       } else {
+         return b;
+       }
+     }
+
+    private:
+     int priority_;
+     int line_;
+   };
+
+   // For each of the directives within user-agents, we keep global and specific
+   // match scores.
+   struct MatchHierarchy {
+     Match global;    // Match for '*'
+     Match specific;  // Match for queried agent.
+     void Clear() {
+       global.Clear();
+       specific.Clear();
+     }
+   };
+   MatchHierarchy allow_;     // Characters of 'url' matching Allow.
+   MatchHierarchy disallow_;  // Characters of 'url' matching Disallow.
+
+   bool seen_global_agent_;         // True if processing global agent rules.
+   bool seen_specific_agent_;       // True if processing our specific agent.
+   bool ever_seen_specific_agent_;  // True if we ever saw a block for our agent.
+   bool seen_separator_;            // True if saw any key: value pair.
+
+   // The path we want to pattern match. Not owned and only a valid pointer
+   // during the lifetime of *AllowedByRobots calls.
+   const char* path_;
+   // The User-Agents we are interested in. Not owned and only a valid
+   // pointer during the lifetime of *AllowedByRobots calls.
+   const std::vector<std::string>* user_agents_;
+
+   RobotsMatchStrategy* match_strategy_;
+ };
+
+ } // namespace googlebot
+ #endif // THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
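
The header above exposes two entry points: the callback-based ParseRobotsTxt() parser and the RobotsMatcher class. The following is a minimal usage sketch, not part of the gem's vendored sources; it assumes the library is compiled and linked together with Abseil, and the handler name SitemapCollector plus the sample robots.txt body are illustrative only.

// Usage sketch (illustrative, not part of the vendored library).
#include <iostream>
#include <string>
#include <vector>

#include "absl/strings/string_view.h"
#include "robots.h"

// A RobotsParseHandler that collects Sitemap directives; all other callbacks
// are no-ops. ParseRobotsTxt() invokes the callbacks in the order the
// directives appear in the file.
class SitemapCollector : public googlebot::RobotsParseHandler {
 public:
  void HandleRobotsStart() override {}
  void HandleRobotsEnd() override {}
  void HandleUserAgent(int line_num, absl::string_view value) override {}
  void HandleAllow(int line_num, absl::string_view value) override {}
  void HandleDisallow(int line_num, absl::string_view value) override {}
  void HandleSitemap(int line_num, absl::string_view value) override {
    sitemaps_.emplace_back(value.data(), value.size());
  }
  void HandleUnknownAction(int line_num, absl::string_view action,
                           absl::string_view value) override {}

  const std::vector<std::string>& sitemaps() const { return sitemaps_; }

 private:
  std::vector<std::string> sitemaps_;
};

int main() {
  const std::string robots_body =
      "User-agent: *\n"
      "Disallow: /cgi-bin/\n"
      "Sitemap: https://example.com/sitemap.xml\n";

  // Low-level parsing via callbacks.
  SitemapCollector collector;
  googlebot::ParseRobotsTxt(robots_body, &collector);
  for (const std::string& sitemap : collector.sitemaps()) {
    std::cout << "sitemap: " << sitemap << std::endl;
  }

  // High-level matching: one user agent against one %-encoded URL.
  googlebot::RobotsMatcher matcher;
  const bool allowed = matcher.OneAgentAllowedByRobots(
      robots_body, "FooBot", "https://example.com/cgi-bin/script");
  std::cout << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
  return 0;
}

With the sample rules, the URL falls under the global Disallow for /cgi-bin/, so the matcher is expected to report DISALLOWED.
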
@@ -0,0 +1,101 @@
+ // Copyright 2019 Google LLC
+ //
+ // Licensed under the Apache License, Version 2.0 (the "License");
+ // you may not use this file except in compliance with the License.
+ // You may obtain a copy of the License at
+ //
+ //      https://www.apache.org/licenses/LICENSE-2.0
+ //
+ // Unless required by applicable law or agreed to in writing, software
+ // distributed under the License is distributed on an "AS IS" BASIS,
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+ //
+ // -----------------------------------------------------------------------------
+ // File: robots_main.cc
+ // -----------------------------------------------------------------------------
+ //
+ // Simple binary to assess whether a URL is accessible to a user-agent according
+ // to records found in a local robots.txt file, based on Google's robots.txt
+ // parsing and matching algorithms.
+ // Usage:
+ //   robots_main <local_path_to_robotstxt> <user_agent> <url>
+ // Arguments:
+ //   local_path_to_robotstxt: local path to a file containing robots.txt records.
+ //     For example: /home/users/username/robots.txt
+ //   user_agent: a token to be matched against records in the robots.txt.
+ //     For example: Googlebot
+ //   url: a URL to be matched against records in the robots.txt. The URL must be
+ //     %-encoded according to RFC3986.
+ //     For example: https://example.com/accessible/url.html
+ // Returns: Prints a sentence with the verdict about whether 'user_agent' is
+ //   allowed to access 'url' based on records in 'local_path_to_robotstxt'.
+ //
+ #include <fstream>
+ #include <iostream>
+
+ #include "robots.h"
+
+ bool LoadFile(const std::string& filename, std::string* result) {
+   std::ifstream file(filename, std::ios::in | std::ios::binary | std::ios::ate);
+   if (file.is_open()) {
+     size_t size = file.tellg();
+     std::vector<char> buffer(size);
+     file.seekg(0, std::ios::beg);
+     file.read(buffer.data(), size);
+     file.close();
+     if (!file) return false;  // file reading error (failbit or badbit).
+     result->assign(buffer.begin(), buffer.end());
+     return true;
+   }
+   return false;
+ }
+
+ void ShowHelp(int argc, char** argv) {
+   std::cerr << "Shows whether the given user_agent and URI combination"
+             << " is allowed or disallowed by the given robots.txt file. "
+             << std::endl
+             << std::endl;
+   std::cerr << "Usage: " << std::endl
+             << " " << argv[0] << " <robots.txt filename> <user_agent> <URI>"
+             << std::endl
+             << std::endl;
+   std::cerr << "The URI must be %-encoded according to RFC3986." << std::endl
+             << std::endl;
+   std::cerr << "Example: " << std::endl
+             << " " << argv[0] << " robots.txt FooBot http://example.com/foo"
+             << std::endl;
+ }
+
+ int main(int argc, char** argv) {
+   std::string filename = argc >= 2 ? argv[1] : "";
+   if (filename == "-h" || filename == "-help" || filename == "--help") {
+     ShowHelp(argc, argv);
+     return 0;
+   }
+   if (argc != 4) {
+     std::cerr << "Invalid amount of arguments. Showing help." << std::endl
+               << std::endl;
+     ShowHelp(argc, argv);
+     return 1;
+   }
+   std::string robots_content;
+   if (!(LoadFile(filename, &robots_content))) {
+     std::cerr << "failed to read file \"" << filename << "\"" << std::endl;
+     return 1;
+   }
+
+   std::string user_agent = argv[2];
+   std::vector<std::string> user_agents(1, user_agent);
+   googlebot::RobotsMatcher matcher;
+   std::string url = argv[3];
+   bool allowed = matcher.AllowedByRobots(robots_content, &user_agents, url);
+
+   std::cout << "user-agent '" << user_agent << "' with URI '" << argv[3]
+             << "': " << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
+   if (robots_content.empty()) {
+     std::cout << "notice: robots file is empty so all user-agents are allowed"
+               << std::endl;
+   }
+ }
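
For completeness, here is a sketch of the same check robots_main.cc performs, but with the robots.txt body held in memory and more than one user agent; per the robots.h contract, AllowedByRobots() returns true iff any agent in the vector may fetch the URL. This harness is not part of the gem, and the agent names, rules, and URL are illustrative only.

// In-memory variant of the robots_main.cc check (illustrative only).
#include <iostream>
#include <string>
#include <vector>

#include "robots.h"

int main() {
  // Rules held in memory instead of being loaded from a file via LoadFile().
  const std::string robots_content =
      "User-agent: FooBot\n"
      "Disallow: /private/\n"
      "\n"
      "User-agent: *\n"
      "Allow: /\n";

  const std::vector<std::string> user_agents = {"FooBot", "BarBot"};
  const std::string url = "https://example.com/private/page.html";

  googlebot::RobotsMatcher matcher;
  // True iff at least one of the listed agents is allowed to fetch 'url'.
  const bool allowed = matcher.AllowedByRobots(robots_content, &user_agents, url);

  std::cout << "URI '" << url << "': "
            << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
  return 0;
}
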