protoc 2.6.1.1 → 2.6.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/protoc/Makefile.in +10 -13
- data/ext/protoc/extconf.rb +0 -1
- data/ext/protoc/protobuf/CHANGES.txt +593 -0
- data/ext/protoc/protobuf/CONTRIBUTORS.txt +93 -0
- data/ext/protoc/protobuf/INSTALL.txt +237 -0
- data/ext/protoc/protobuf/LICENSE +33 -0
- data/ext/protoc/protobuf/Makefile.am +267 -0
- data/ext/protoc/protobuf/README.md +167 -0
- data/ext/protoc/protobuf/autogen.sh +41 -0
- data/ext/protoc/protobuf/benchmarks/ProtoBench.java +203 -0
- data/ext/protoc/protobuf/benchmarks/google_message1.dat +0 -0
- data/ext/protoc/protobuf/benchmarks/google_message2.dat +0 -0
- data/ext/protoc/protobuf/benchmarks/google_size.proto +136 -0
- data/ext/protoc/protobuf/benchmarks/google_speed.proto +136 -0
- data/ext/protoc/protobuf/benchmarks/readme.txt +50 -0
- data/ext/protoc/protobuf/configure.ac +159 -0
- data/ext/protoc/protobuf/editors/README.txt +5 -0
- data/ext/protoc/protobuf/editors/proto.vim +105 -0
- data/ext/protoc/protobuf/editors/protobuf-mode.el +220 -0
- data/ext/protoc/protobuf/examples/AddPerson.java +95 -0
- data/ext/protoc/protobuf/examples/ListPeople.java +50 -0
- data/ext/protoc/protobuf/examples/Makefile +58 -0
- data/ext/protoc/protobuf/examples/README.txt +29 -0
- data/ext/protoc/protobuf/examples/add_person.cc +95 -0
- data/ext/protoc/protobuf/examples/add_person.py +58 -0
- data/ext/protoc/protobuf/examples/addressbook.proto +30 -0
- data/ext/protoc/protobuf/examples/list_people.cc +68 -0
- data/ext/protoc/protobuf/examples/list_people.py +38 -0
- data/ext/protoc/protobuf/generate_descriptor_proto.sh +33 -0
- data/ext/protoc/protobuf/java/README.txt +96 -0
- data/ext/protoc/protobuf/java/pom.xml +217 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/AbstractMessage.java +466 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/AbstractMessageLite.java +355 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/AbstractParser.java +253 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/BlockingRpcChannel.java +51 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/BlockingService.java +64 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/BoundedByteString.java +163 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/ByteString.java +1022 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/CodedInputStream.java +1311 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/CodedOutputStream.java +1297 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/Descriptors.java +2238 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/DynamicMessage.java +622 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/Extension.java +96 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/ExtensionRegistry.java +392 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/ExtensionRegistryLite.java +185 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/FieldSet.java +907 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/GeneratedMessage.java +2213 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/GeneratedMessageLite.java +949 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/Internal.java +391 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/InvalidProtocolBufferException.java +122 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/LazyField.java +154 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/LazyFieldLite.java +176 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/LazyStringArrayList.java +367 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/LazyStringList.java +163 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/LiteralByteString.java +362 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/Message.java +244 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/MessageLite.java +320 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/MessageLiteOrBuilder.java +60 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/MessageOrBuilder.java +143 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/MessageReflection.java +931 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/Parser.java +261 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/ProtocolMessageEnum.java +58 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/ProtocolStringList.java +48 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/RepeatedFieldBuilder.java +696 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/RopeByteString.java +957 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/RpcCallback.java +47 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/RpcChannel.java +71 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/RpcController.java +118 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/RpcUtil.java +134 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/Service.java +117 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/ServiceException.java +52 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/SingleFieldBuilder.java +241 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/SmallSortedMap.java +618 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/TextFormat.java +1984 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/UninitializedMessageException.java +99 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/UnknownFieldSet.java +995 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/UnmodifiableLazyStringList.java +205 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/Utf8.java +349 -0
- data/ext/protoc/protobuf/java/src/main/java/com/google/protobuf/WireFormat.java +163 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/AbstractMessageTest.java +527 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/BoundedByteStringTest.java +68 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/ByteStringTest.java +759 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/CheckUtf8Test.java +141 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/CodedInputStreamTest.java +769 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/CodedOutputStreamTest.java +401 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/DeprecatedFieldTest.java +80 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/DescriptorsTest.java +735 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/DynamicMessageTest.java +326 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/ForceFieldBuildersPreRun.java +48 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/GeneratedMessageTest.java +1515 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/IsValidUtf8Test.java +180 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/IsValidUtf8TestUtil.java +421 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/LazyFieldLiteTest.java +134 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/LazyFieldTest.java +121 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/LazyMessageLiteTest.java +319 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/LazyStringArrayListTest.java +174 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/LazyStringEndToEndTest.java +143 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/LiteEqualsAndHashTest.java +85 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/LiteTest.java +148 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/LiteralByteStringTest.java +396 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/MessageTest.java +353 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/NestedBuildersTest.java +185 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/ParserTest.java +381 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/RepeatedFieldBuilderTest.java +190 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/RopeByteStringSubstringTest.java +97 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/RopeByteStringTest.java +115 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/ServiceTest.java +320 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/SingleFieldBuilderTest.java +155 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/SmallSortedMapTest.java +420 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/TestBadIdentifiers.java +96 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/TestUtil.java +4124 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/TextFormatTest.java +994 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/UnknownFieldSetTest.java +653 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/UnmodifiableLazyStringListTest.java +227 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/WireFormatTest.java +606 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/lazy_fields_lite.proto +61 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/lite_equals_and_hash.proto +55 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/multiple_files_test.proto +77 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/nested_builders_test.proto +53 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/nested_extension.proto +46 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/nested_extension_lite.proto +48 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/non_nested_extension.proto +49 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/non_nested_extension_lite.proto +50 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/outer_class_name_test.proto +38 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/outer_class_name_test2.proto +42 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/outer_class_name_test3.proto +43 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/test_bad_identifiers.proto +157 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/test_check_utf8.proto +50 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/test_check_utf8_size.proto +51 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/test_custom_options.proto +43 -0
- data/ext/protoc/protobuf/java/src/test/java/com/google/protobuf/test_extra_interfaces.proto +60 -0
- data/ext/protoc/protobuf/m4/ac_system_extensions.m4 +37 -0
- data/ext/protoc/protobuf/m4/acx_check_suncc.m4 +75 -0
- data/ext/protoc/protobuf/m4/acx_pthread.m4 +397 -0
- data/ext/protoc/protobuf/m4/stl_hash.m4 +72 -0
- data/ext/protoc/protobuf/more_tests/Makefile +41 -0
- data/ext/protoc/protobuf/post_process_dist.sh +60 -0
- data/ext/protoc/protobuf/protobuf-lite.pc.in +13 -0
- data/ext/protoc/protobuf/protobuf.pc.in +14 -0
- data/ext/protoc/protobuf/python/README.txt +105 -0
- data/ext/protoc/protobuf/python/ez_setup.py +284 -0
- data/ext/protoc/protobuf/python/google/__init__.py +1 -0
- data/ext/protoc/protobuf/python/google/protobuf/__init__.py +0 -0
- data/ext/protoc/protobuf/python/google/protobuf/descriptor.py +849 -0
- data/ext/protoc/protobuf/python/google/protobuf/descriptor_database.py +137 -0
- data/ext/protoc/protobuf/python/google/protobuf/descriptor_pool.py +643 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/__init__.py +0 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/api_implementation.cc +139 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/api_implementation.py +89 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/api_implementation_default_test.py +63 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/containers.py +269 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/cpp_message.py +663 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/decoder.py +831 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/descriptor_database_test.py +63 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/descriptor_pool_test.py +564 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/descriptor_pool_test1.proto +94 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/descriptor_pool_test2.proto +70 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/descriptor_python_test.py +54 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/descriptor_test.py +669 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/encoder.py +788 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/enum_type_wrapper.py +89 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/factory_test1.proto +57 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/factory_test2.proto +92 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/generator_test.py +343 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/message_factory_python_test.py +54 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/message_factory_test.py +131 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/message_listener.py +78 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/message_python_test.py +54 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/message_test.py +681 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/missing_enum_values.proto +50 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/more_extensions.proto +58 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/more_extensions_dynamic.proto +49 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/more_messages.proto +51 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/python_message.py +1251 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/reflection_test.py +2934 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/service_reflection_test.py +136 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/symbol_database_test.py +120 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/test_bad_identifiers.proto +52 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/test_util.py +662 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/text_encoding_test.py +68 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/text_format_test.py +743 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/type_checkers.py +328 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/unknown_fields_test.py +231 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/wire_format.py +268 -0
- data/ext/protoc/protobuf/python/google/protobuf/internal/wire_format_test.py +253 -0
- data/ext/protoc/protobuf/python/google/protobuf/message.py +284 -0
- data/ext/protoc/protobuf/python/google/protobuf/message_factory.py +155 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/README +6 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/__init__.py +0 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/cpp_message.py +61 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/descriptor.cc +357 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/descriptor.h +96 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/descriptor_cpp2_test.py +58 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/extension_dict.cc +338 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/extension_dict.h +123 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/message.cc +2561 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/message.h +305 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/message_factory_cpp2_test.py +56 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/proto2_api_test.proto +38 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/python.proto +66 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/python_protobuf.h +57 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/reflection_cpp2_generated_test.py +94 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/repeated_composite_container.cc +763 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/repeated_composite_container.h +172 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/repeated_scalar_container.cc +825 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/repeated_scalar_container.h +112 -0
- data/ext/protoc/protobuf/python/google/protobuf/pyext/scoped_pyobject_ptr.h +95 -0
- data/ext/protoc/protobuf/python/google/protobuf/reflection.py +205 -0
- data/ext/protoc/protobuf/python/google/protobuf/service.py +226 -0
- data/ext/protoc/protobuf/python/google/protobuf/service_reflection.py +284 -0
- data/ext/protoc/protobuf/python/google/protobuf/symbol_database.py +185 -0
- data/ext/protoc/protobuf/python/google/protobuf/text_encoding.py +110 -0
- data/ext/protoc/protobuf/python/google/protobuf/text_format.py +873 -0
- data/ext/protoc/protobuf/python/mox.py +1401 -0
- data/ext/protoc/protobuf/python/setup.py +201 -0
- data/ext/protoc/protobuf/python/stubout.py +140 -0
- data/ext/protoc/protobuf/src/Makefile.am +418 -0
- data/ext/protoc/protobuf/src/google/protobuf/SEBS +240 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/code_generator.cc +84 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/code_generator.h +145 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/command_line_interface.cc +1603 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/command_line_interface.h +378 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/command_line_interface_unittest.cc +1654 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_bootstrap_unittest.cc +158 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_enum.cc +288 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_enum.h +103 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_enum_field.cc +431 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_enum_field.h +122 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_extension.cc +210 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_extension.h +86 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_field.cc +166 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_field.h +185 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_file.cc +665 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_file.h +99 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_generator.cc +125 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_generator.h +72 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_helpers.cc +494 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_helpers.h +206 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_message.cc +2645 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_message.h +175 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_message_field.cc +375 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_message_field.h +121 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_options.h +58 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_plugin_unittest.cc +123 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_primitive_field.cc +451 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_primitive_field.h +123 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_service.cc +334 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_service.h +118 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_string_field.cc +642 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_string_field.h +127 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_test_bad_identifiers.proto +132 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_unittest.cc +2074 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/cpp/cpp_unittest.h +51 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/importer.cc +480 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/importer.h +317 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/importer_unittest.cc +617 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_context.cc +195 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_context.h +95 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_doc_comment.cc +233 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_doc_comment.h +69 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_doc_comment_unittest.cc +67 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_enum.cc +333 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_enum.h +99 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_enum_field.cc +778 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_enum_field.h +158 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_extension.cc +207 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_extension.h +109 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_field.cc +213 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_field.h +162 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_file.cc +534 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_file.h +115 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_generator.cc +158 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_generator.h +72 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_generator_factory.cc +77 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_generator_factory.h +101 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_helpers.cc +737 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_helpers.h +322 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_lazy_message_field.cc +826 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_lazy_message_field.h +121 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_message.cc +1666 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_message.h +140 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_message_field.cc +1343 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_message_field.h +173 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_name_resolver.cc +266 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_name_resolver.h +124 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_plugin_unittest.cc +124 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_primitive_field.cc +877 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_primitive_field.h +160 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_service.cc +473 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_service.h +135 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_shared_code_generator.cc +201 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_shared_code_generator.h +90 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_string_field.cc +1056 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/java/java_string_field.h +160 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/main.cc +61 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/mock_code_generator.cc +240 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/mock_code_generator.h +117 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/package_info.h +64 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/parser.cc +1750 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/parser.h +522 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/parser_unittest.cc +2612 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/plugin.cc +163 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/plugin.h +72 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/plugin.pb.cc +1148 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/plugin.pb.h +897 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/plugin.proto +147 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/python/python_generator.cc +1262 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/python/python_generator.h +166 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/python/python_plugin_unittest.cc +118 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/subprocess.cc +463 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/subprocess.h +108 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/test_plugin.cc +51 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/zip_output_unittest.sh +91 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/zip_writer.cc +218 -0
- data/ext/protoc/protobuf/src/google/protobuf/compiler/zip_writer.h +93 -0
- data/ext/protoc/protobuf/src/google/protobuf/descriptor.cc +5420 -0
- data/ext/protoc/protobuf/src/google/protobuf/descriptor.h +1691 -0
- data/ext/protoc/protobuf/src/google/protobuf/descriptor.pb.cc +9135 -0
- data/ext/protoc/protobuf/src/google/protobuf/descriptor.pb.h +6761 -0
- data/ext/protoc/protobuf/src/google/protobuf/descriptor.proto +687 -0
- data/ext/protoc/protobuf/src/google/protobuf/descriptor_database.cc +543 -0
- data/ext/protoc/protobuf/src/google/protobuf/descriptor_database.h +369 -0
- data/ext/protoc/protobuf/src/google/protobuf/descriptor_database_unittest.cc +748 -0
- data/ext/protoc/protobuf/src/google/protobuf/descriptor_pb2_test.py +54 -0
- data/ext/protoc/protobuf/src/google/protobuf/descriptor_unittest.cc +5501 -0
- data/ext/protoc/protobuf/src/google/protobuf/dynamic_message.cc +764 -0
- data/ext/protoc/protobuf/src/google/protobuf/dynamic_message.h +148 -0
- data/ext/protoc/protobuf/src/google/protobuf/dynamic_message_unittest.cc +230 -0
- data/ext/protoc/protobuf/src/google/protobuf/extension_set.cc +1663 -0
- data/ext/protoc/protobuf/src/google/protobuf/extension_set.h +1234 -0
- data/ext/protoc/protobuf/src/google/protobuf/extension_set_heavy.cc +734 -0
- data/ext/protoc/protobuf/src/google/protobuf/extension_set_unittest.cc +1095 -0
- data/ext/protoc/protobuf/src/google/protobuf/generated_enum_reflection.h +91 -0
- data/ext/protoc/protobuf/src/google/protobuf/generated_message_reflection.cc +1683 -0
- data/ext/protoc/protobuf/src/google/protobuf/generated_message_reflection.h +504 -0
- data/ext/protoc/protobuf/src/google/protobuf/generated_message_reflection_unittest.cc +795 -0
- data/ext/protoc/protobuf/src/google/protobuf/generated_message_util.cc +65 -0
- data/ext/protoc/protobuf/src/google/protobuf/generated_message_util.h +113 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/coded_stream.cc +914 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/coded_stream.h +1220 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/coded_stream_inl.h +69 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/coded_stream_unittest.cc +1378 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/gzip_stream.cc +326 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/gzip_stream.h +209 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/gzip_stream_unittest.sh +44 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/package_info.h +54 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/printer.cc +198 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/printer.h +136 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/printer_unittest.cc +285 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/strtod.cc +113 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/strtod.h +50 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/tokenizer.cc +1127 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/tokenizer.h +402 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/tokenizer_unittest.cc +999 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/zero_copy_stream.cc +57 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/zero_copy_stream.h +248 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/zero_copy_stream_impl.cc +473 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/zero_copy_stream_impl.h +358 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/zero_copy_stream_impl_lite.cc +405 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/zero_copy_stream_impl_lite.h +354 -0
- data/ext/protoc/protobuf/src/google/protobuf/io/zero_copy_stream_unittest.cc +965 -0
- data/ext/protoc/protobuf/src/google/protobuf/lite_unittest.cc +350 -0
- data/ext/protoc/protobuf/src/google/protobuf/message.cc +358 -0
- data/ext/protoc/protobuf/src/google/protobuf/message.h +866 -0
- data/ext/protoc/protobuf/src/google/protobuf/message_lite.cc +335 -0
- data/ext/protoc/protobuf/src/google/protobuf/message_lite.h +247 -0
- data/ext/protoc/protobuf/src/google/protobuf/message_unittest.cc +427 -0
- data/ext/protoc/protobuf/src/google/protobuf/package_info.h +64 -0
- data/ext/protoc/protobuf/src/google/protobuf/reflection_ops.cc +269 -0
- data/ext/protoc/protobuf/src/google/protobuf/reflection_ops.h +81 -0
- data/ext/protoc/protobuf/src/google/protobuf/reflection_ops_unittest.cc +475 -0
- data/ext/protoc/protobuf/src/google/protobuf/repeated_field.cc +87 -0
- data/ext/protoc/protobuf/src/google/protobuf/repeated_field.h +1603 -0
- data/ext/protoc/protobuf/src/google/protobuf/repeated_field_reflection_unittest.cc +195 -0
- data/ext/protoc/protobuf/src/google/protobuf/repeated_field_unittest.cc +1442 -0
- data/ext/protoc/protobuf/src/google/protobuf/service.cc +46 -0
- data/ext/protoc/protobuf/src/google/protobuf/service.h +291 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/atomicops.h +227 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/atomicops_internals_arm64_gcc.h +325 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/atomicops_internals_arm_gcc.h +151 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/atomicops_internals_arm_qnx.h +146 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/atomicops_internals_atomicword_compat.h +122 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/atomicops_internals_generic_gcc.h +137 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/atomicops_internals_macosx.h +225 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/atomicops_internals_mips_gcc.h +313 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/atomicops_internals_pnacl.h +73 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/atomicops_internals_solaris.h +188 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/atomicops_internals_tsan.h +219 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/atomicops_internals_x86_gcc.cc +137 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/atomicops_internals_x86_gcc.h +293 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/atomicops_internals_x86_msvc.cc +112 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/atomicops_internals_x86_msvc.h +150 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/common.cc +395 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/common.h +1226 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/common_unittest.cc +357 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/hash.h +232 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/map_util.h +771 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/once.cc +99 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/once.h +166 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/once_unittest.cc +253 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/platform_macros.h +103 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/shared_ptr.h +470 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/stl_util.h +121 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/stringprintf.cc +175 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/stringprintf.h +76 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/stringprintf_unittest.cc +152 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/structurally_valid.cc +536 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/structurally_valid_unittest.cc +40 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/strutil.cc +1279 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/strutil.h +562 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/strutil_unittest.cc +73 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/substitute.cc +134 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/substitute.h +170 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/template_util.h +138 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/template_util_unittest.cc +130 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/type_traits.h +336 -0
- data/ext/protoc/protobuf/src/google/protobuf/stubs/type_traits_unittest.cc +628 -0
- data/ext/protoc/protobuf/src/google/protobuf/test_util.cc +3345 -0
- data/ext/protoc/protobuf/src/google/protobuf/test_util.h +215 -0
- data/ext/protoc/protobuf/src/google/protobuf/test_util_lite.cc +1585 -0
- data/ext/protoc/protobuf/src/google/protobuf/test_util_lite.h +101 -0
- data/ext/protoc/protobuf/src/google/protobuf/testdata/bad_utf8_string +1 -0
- data/ext/protoc/protobuf/src/google/protobuf/testdata/golden_message +0 -0
- data/ext/protoc/protobuf/src/google/protobuf/testdata/golden_message_oneof_implemented +0 -0
- data/ext/protoc/protobuf/src/google/protobuf/testdata/golden_packed_fields_message +0 -0
- data/ext/protoc/protobuf/src/google/protobuf/testdata/text_format_unittest_data.txt +134 -0
- data/ext/protoc/protobuf/src/google/protobuf/testdata/text_format_unittest_data_oneof_implemented.txt +129 -0
- data/ext/protoc/protobuf/src/google/protobuf/testdata/text_format_unittest_data_pointy.txt +134 -0
- data/ext/protoc/protobuf/src/google/protobuf/testdata/text_format_unittest_data_pointy_oneof.txt +129 -0
- data/ext/protoc/protobuf/src/google/protobuf/testdata/text_format_unittest_extensions_data.txt +134 -0
- data/ext/protoc/protobuf/src/google/protobuf/testdata/text_format_unittest_extensions_data_pointy.txt +134 -0
- data/ext/protoc/protobuf/src/google/protobuf/testing/file.cc +194 -0
- data/ext/protoc/protobuf/src/google/protobuf/testing/file.h +97 -0
- data/ext/protoc/protobuf/src/google/protobuf/testing/googletest.cc +255 -0
- data/ext/protoc/protobuf/src/google/protobuf/testing/googletest.h +102 -0
- data/ext/protoc/protobuf/src/google/protobuf/testing/zcgunzip.cc +73 -0
- data/ext/protoc/protobuf/src/google/protobuf/testing/zcgzip.cc +79 -0
- data/ext/protoc/protobuf/src/google/protobuf/text_format.cc +1746 -0
- data/ext/protoc/protobuf/src/google/protobuf/text_format.h +473 -0
- data/ext/protoc/protobuf/src/google/protobuf/text_format_unittest.cc +1479 -0
- data/ext/protoc/protobuf/src/google/protobuf/unittest.proto +861 -0
- data/ext/protoc/protobuf/src/google/protobuf/unittest_custom_options.proto +393 -0
- data/ext/protoc/protobuf/src/google/protobuf/unittest_embed_optimize_for.proto +50 -0
- data/ext/protoc/protobuf/src/google/protobuf/unittest_empty.proto +37 -0
- data/ext/protoc/protobuf/src/google/protobuf/unittest_enormous_descriptor.proto +1046 -0
- data/ext/protoc/protobuf/src/google/protobuf/unittest_import.proto +64 -0
- data/ext/protoc/protobuf/src/google/protobuf/unittest_import_lite.proto +51 -0
- data/ext/protoc/protobuf/src/google/protobuf/unittest_import_public.proto +40 -0
- data/ext/protoc/protobuf/src/google/protobuf/unittest_import_public_lite.proto +42 -0
- data/ext/protoc/protobuf/src/google/protobuf/unittest_lite.proto +384 -0
- data/ext/protoc/protobuf/src/google/protobuf/unittest_lite_imports_nonlite.proto +43 -0
- data/ext/protoc/protobuf/src/google/protobuf/unittest_mset.proto +83 -0
- data/ext/protoc/protobuf/src/google/protobuf/unittest_no_generic_services.proto +53 -0
- data/ext/protoc/protobuf/src/google/protobuf/unittest_optimize_for.proto +66 -0
- data/ext/protoc/protobuf/src/google/protobuf/unknown_field_set.cc +265 -0
- data/ext/protoc/protobuf/src/google/protobuf/unknown_field_set.h +318 -0
- data/ext/protoc/protobuf/src/google/protobuf/unknown_field_set_unittest.cc +599 -0
- data/ext/protoc/protobuf/src/google/protobuf/wire_format.cc +1101 -0
- data/ext/protoc/protobuf/src/google/protobuf/wire_format.h +336 -0
- data/ext/protoc/protobuf/src/google/protobuf/wire_format_lite.cc +471 -0
- data/ext/protoc/protobuf/src/google/protobuf/wire_format_lite.h +661 -0
- data/ext/protoc/protobuf/src/google/protobuf/wire_format_lite_inl.h +860 -0
- data/ext/protoc/protobuf/src/google/protobuf/wire_format_unittest.cc +1120 -0
- data/ext/protoc/protobuf/src/solaris/libstdc++.la +51 -0
- data/ext/protoc/protobuf/vsprojects/config.h +29 -0
- data/ext/protoc/protobuf/vsprojects/convert2008to2005.sh +20 -0
- data/ext/protoc/protobuf/vsprojects/extract_includes.bat +50 -0
- data/ext/protoc/protobuf/vsprojects/libprotobuf-lite.vcproj +302 -0
- data/ext/protoc/protobuf/vsprojects/libprotobuf.vcproj +470 -0
- data/ext/protoc/protobuf/vsprojects/libprotoc.vcproj +466 -0
- data/ext/protoc/protobuf/vsprojects/lite-test.vcproj +305 -0
- data/ext/protoc/protobuf/vsprojects/protobuf.sln +92 -0
- data/ext/protoc/protobuf/vsprojects/protoc.vcproj +192 -0
- data/ext/protoc/protobuf/vsprojects/readme.txt +114 -0
- data/ext/protoc/protobuf/vsprojects/test_plugin.vcproj +209 -0
- data/ext/protoc/protobuf/vsprojects/tests.vcproj +681 -0
- data/lib/protoc/version.rb +1 -1
- metadata +480 -3
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
// Protocol Buffers - Google's data interchange format
|
|
2
|
+
// Copyright 2008 Google Inc. All rights reserved.
|
|
3
|
+
// https://developers.google.com/protocol-buffers/
|
|
4
|
+
//
|
|
5
|
+
// Redistribution and use in source and binary forms, with or without
|
|
6
|
+
// modification, are permitted provided that the following conditions are
|
|
7
|
+
// met:
|
|
8
|
+
//
|
|
9
|
+
// * Redistributions of source code must retain the above copyright
|
|
10
|
+
// notice, this list of conditions and the following disclaimer.
|
|
11
|
+
// * Redistributions in binary form must reproduce the above
|
|
12
|
+
// copyright notice, this list of conditions and the following disclaimer
|
|
13
|
+
// in the documentation and/or other materials provided with the
|
|
14
|
+
// distribution.
|
|
15
|
+
// * Neither the name of Google Inc. nor the names of its
|
|
16
|
+
// contributors may be used to endorse or promote products derived from
|
|
17
|
+
// this software without specific prior written permission.
|
|
18
|
+
//
|
|
19
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
20
|
+
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
21
|
+
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
22
|
+
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
23
|
+
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
24
|
+
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
25
|
+
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
26
|
+
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
27
|
+
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
28
|
+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
29
|
+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
30
|
+
|
|
31
|
+
// Author: kenton@google.com (Kenton Varda)
|
|
32
|
+
// Based on original Protocol Buffers design by
|
|
33
|
+
// Sanjay Ghemawat, Jeff Dean, and others.
|
|
34
|
+
//
|
|
35
|
+
// Class for parsing tokenized text from a ZeroCopyInputStream.
|
|
36
|
+
|
|
37
|
+
#ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
|
|
38
|
+
#define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
|
|
39
|
+
|
|
40
|
+
#include <string>
|
|
41
|
+
#include <vector>
|
|
42
|
+
#include <google/protobuf/stubs/common.h>
|
|
43
|
+
|
|
44
|
+
namespace google {
|
|
45
|
+
namespace protobuf {
|
|
46
|
+
namespace io {
|
|
47
|
+
|
|
48
|
+
class ZeroCopyInputStream; // zero_copy_stream.h
|
|
49
|
+
|
|
50
|
+
// Defined in this file.
|
|
51
|
+
class ErrorCollector;
|
|
52
|
+
class Tokenizer;
|
|
53
|
+
|
|
54
|
+
// Abstract interface for an object which collects the errors that occur
|
|
55
|
+
// during parsing. A typical implementation might simply print the errors
|
|
56
|
+
// to stdout.
|
|
57
|
+
class LIBPROTOBUF_EXPORT ErrorCollector {
|
|
58
|
+
public:
|
|
59
|
+
inline ErrorCollector() {}
|
|
60
|
+
virtual ~ErrorCollector();
|
|
61
|
+
|
|
62
|
+
// Indicates that there was an error in the input at the given line and
|
|
63
|
+
// column numbers. The numbers are zero-based, so you may want to add
|
|
64
|
+
// 1 to each before printing them.
|
|
65
|
+
virtual void AddError(int line, int column, const string& message) = 0;
|
|
66
|
+
|
|
67
|
+
// Indicates that there was a warning in the input at the given line and
|
|
68
|
+
// column numbers. The numbers are zero-based, so you may want to add
|
|
69
|
+
// 1 to each before printing them.
|
|
70
|
+
virtual void AddWarning(int /* line */, int /* column */,
|
|
71
|
+
const string& /* message */) { }
|
|
72
|
+
|
|
73
|
+
private:
|
|
74
|
+
GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
|
|
75
|
+
};
|
|
76
|
+
|
|
77
|
+
// This class converts a stream of raw text into a stream of tokens for
|
|
78
|
+
// the protocol definition parser to parse. The tokens recognized are
|
|
79
|
+
// similar to those that make up the C language; see the TokenType enum for
|
|
80
|
+
// precise descriptions. Whitespace and comments are skipped. By default,
|
|
81
|
+
// C- and C++-style comments are recognized, but other styles can be used by
|
|
82
|
+
// calling set_comment_style().
|
|
83
|
+
class LIBPROTOBUF_EXPORT Tokenizer {
|
|
84
|
+
public:
|
|
85
|
+
// Construct a Tokenizer that reads and tokenizes text from the given
|
|
86
|
+
// input stream and writes errors to the given error_collector.
|
|
87
|
+
// The caller keeps ownership of input and error_collector.
|
|
88
|
+
Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
|
|
89
|
+
~Tokenizer();
|
|
90
|
+
|
|
91
|
+
enum TokenType {
|
|
92
|
+
TYPE_START, // Next() has not yet been called.
|
|
93
|
+
TYPE_END, // End of input reached. "text" is empty.
|
|
94
|
+
|
|
95
|
+
TYPE_IDENTIFIER, // A sequence of letters, digits, and underscores, not
|
|
96
|
+
// starting with a digit. It is an error for a number
|
|
97
|
+
// to be followed by an identifier with no space in
|
|
98
|
+
// between.
|
|
99
|
+
TYPE_INTEGER, // A sequence of digits representing an integer. Normally
|
|
100
|
+
// the digits are decimal, but a prefix of "0x" indicates
|
|
101
|
+
// a hex number and a leading zero indicates octal, just
|
|
102
|
+
// like with C numeric literals. A leading negative sign
|
|
103
|
+
// is NOT included in the token; it's up to the parser to
|
|
104
|
+
// interpret the unary minus operator on its own.
|
|
105
|
+
TYPE_FLOAT, // A floating point literal, with a fractional part and/or
|
|
106
|
+
// an exponent. Always in decimal. Again, never
|
|
107
|
+
// negative.
|
|
108
|
+
TYPE_STRING, // A quoted sequence of escaped characters. Either single
|
|
109
|
+
// or double quotes can be used, but they must match.
|
|
110
|
+
// A string literal cannot cross a line break.
|
|
111
|
+
TYPE_SYMBOL, // Any other printable character, like '!' or '+'.
|
|
112
|
+
// Symbols are always a single character, so "!+$%" is
|
|
113
|
+
// four tokens.
|
|
114
|
+
};
|
|
115
|
+
|
|
116
|
+
// Structure representing a token read from the token stream.
|
|
117
|
+
struct Token {
|
|
118
|
+
TokenType type;
|
|
119
|
+
string text; // The exact text of the token as it appeared in
|
|
120
|
+
// the input. e.g. tokens of TYPE_STRING will still
|
|
121
|
+
// be escaped and in quotes.
|
|
122
|
+
|
|
123
|
+
// "line" and "column" specify the position of the first character of
|
|
124
|
+
// the token within the input stream. They are zero-based.
|
|
125
|
+
int line;
|
|
126
|
+
int column;
|
|
127
|
+
int end_column;
|
|
128
|
+
};
|
|
129
|
+
|
|
130
|
+
// Get the current token. This is updated when Next() is called. Before
|
|
131
|
+
// the first call to Next(), current() has type TYPE_START and no contents.
|
|
132
|
+
const Token& current();
|
|
133
|
+
|
|
134
|
+
// Return the previous token -- i.e. what current() returned before the
|
|
135
|
+
// previous call to Next().
|
|
136
|
+
const Token& previous();
|
|
137
|
+
|
|
138
|
+
// Advance to the next token. Returns false if the end of the input is
|
|
139
|
+
// reached.
|
|
140
|
+
bool Next();
|
|
141
|
+
|
|
142
|
+
// Like Next(), but also collects comments which appear between the previous
|
|
143
|
+
// and next tokens.
|
|
144
|
+
//
|
|
145
|
+
// Comments which appear to be attached to the previous token are stored
|
|
146
|
+
// in *prev_tailing_comments. Comments which appear to be attached to the
|
|
147
|
+
// next token are stored in *next_leading_comments. Comments appearing in
|
|
148
|
+
// between which do not appear to be attached to either will be added to
|
|
149
|
+
// detached_comments. Any of these parameters can be NULL to simply discard
|
|
150
|
+
// the comments.
|
|
151
|
+
//
|
|
152
|
+
// A series of line comments appearing on consecutive lines, with no other
|
|
153
|
+
// tokens appearing on those lines, will be treated as a single comment.
|
|
154
|
+
//
|
|
155
|
+
// Only the comment content is returned; comment markers (e.g. //) are
|
|
156
|
+
// stripped out. For block comments, leading whitespace and an asterisk will
|
|
157
|
+
// be stripped from the beginning of each line other than the first. Newlines
|
|
158
|
+
// are included in the output.
|
|
159
|
+
//
|
|
160
|
+
// Examples:
|
|
161
|
+
//
|
|
162
|
+
// optional int32 foo = 1; // Comment attached to foo.
|
|
163
|
+
// // Comment attached to bar.
|
|
164
|
+
// optional int32 bar = 2;
|
|
165
|
+
//
|
|
166
|
+
// optional string baz = 3;
|
|
167
|
+
// // Comment attached to baz.
|
|
168
|
+
// // Another line attached to baz.
|
|
169
|
+
//
|
|
170
|
+
// // Comment attached to qux.
|
|
171
|
+
// //
|
|
172
|
+
// // Another line attached to qux.
|
|
173
|
+
// optional double qux = 4;
|
|
174
|
+
//
|
|
175
|
+
// // Detached comment. This is not attached to qux or corge
|
|
176
|
+
// // because there are blank lines separating it from both.
|
|
177
|
+
//
|
|
178
|
+
// optional string corge = 5;
|
|
179
|
+
// /* Block comment attached
|
|
180
|
+
// * to corge. Leading asterisks
|
|
181
|
+
// * will be removed. */
|
|
182
|
+
// /* Block comment attached to
|
|
183
|
+
// * grault. */
|
|
184
|
+
// optional int32 grault = 6;
|
|
185
|
+
bool NextWithComments(string* prev_trailing_comments,
|
|
186
|
+
vector<string>* detached_comments,
|
|
187
|
+
string* next_leading_comments);
|
|
188
|
+
|
|
189
|
+
// Parse helpers ---------------------------------------------------
|
|
190
|
+
|
|
191
|
+
// Parses a TYPE_FLOAT token. This never fails, so long as the text actually
|
|
192
|
+
// comes from a TYPE_FLOAT token parsed by Tokenizer. If it doesn't, the
|
|
193
|
+
// result is undefined (possibly an assert failure).
|
|
194
|
+
static double ParseFloat(const string& text);
|
|
195
|
+
|
|
196
|
+
// Parses a TYPE_STRING token. This never fails, so long as the text actually
|
|
197
|
+
// comes from a TYPE_STRING token parsed by Tokenizer. If it doesn't, the
|
|
198
|
+
// result is undefined (possibly an assert failure).
|
|
199
|
+
static void ParseString(const string& text, string* output);
|
|
200
|
+
|
|
201
|
+
// Identical to ParseString, but appends to output.
|
|
202
|
+
static void ParseStringAppend(const string& text, string* output);
|
|
203
|
+
|
|
204
|
+
// Parses a TYPE_INTEGER token. Returns false if the result would be
|
|
205
|
+
// greater than max_value. Otherwise, returns true and sets *output to the
|
|
206
|
+
// result. If the text is not from a Token of type TYPE_INTEGER originally
|
|
207
|
+
// parsed by a Tokenizer, the result is undefined (possibly an assert
|
|
208
|
+
// failure).
|
|
209
|
+
static bool ParseInteger(const string& text, uint64 max_value,
|
|
210
|
+
uint64* output);
|
|
211
|
+
|
|
212
|
+
// Options ---------------------------------------------------------
|
|
213
|
+
|
|
214
|
+
// Set true to allow floats to be suffixed with the letter 'f'. Tokens
|
|
215
|
+
// which would otherwise be integers but which have the 'f' suffix will be
|
|
216
|
+
// forced to be interpreted as floats. For all other purposes, the 'f' is
|
|
217
|
+
// ignored.
|
|
218
|
+
void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }
|
|
219
|
+
|
|
220
|
+
// Valid values for set_comment_style().
|
|
221
|
+
enum CommentStyle {
|
|
222
|
+
// Line comments begin with "//", block comments are delimited by "/*" and
|
|
223
|
+
// "*/".
|
|
224
|
+
CPP_COMMENT_STYLE,
|
|
225
|
+
// Line comments begin with "#". No way to write block comments.
|
|
226
|
+
SH_COMMENT_STYLE
|
|
227
|
+
};
|
|
228
|
+
|
|
229
|
+
// Sets the comment style.
|
|
230
|
+
void set_comment_style(CommentStyle style) { comment_style_ = style; }
|
|
231
|
+
|
|
232
|
+
// Whether to require whitespace between a number and a field name.
|
|
233
|
+
// Default is true. Do not use this; for Google-internal cleanup only.
|
|
234
|
+
void set_require_space_after_number(bool require) {
|
|
235
|
+
require_space_after_number_ = require;
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
// Whether to allow string literals to span multiple lines. Default is false.
|
|
239
|
+
// Do not use this; for Google-internal cleanup only.
|
|
240
|
+
void set_allow_multiline_strings(bool allow) {
|
|
241
|
+
allow_multiline_strings_ = allow;
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
// External helper: validate an identifier.
|
|
245
|
+
static bool IsIdentifier(const string& text);
|
|
246
|
+
|
|
247
|
+
// -----------------------------------------------------------------
|
|
248
|
+
private:
|
|
249
|
+
GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);
|
|
250
|
+
|
|
251
|
+
Token current_; // Returned by current().
|
|
252
|
+
Token previous_; // Returned by previous().
|
|
253
|
+
|
|
254
|
+
ZeroCopyInputStream* input_;
|
|
255
|
+
ErrorCollector* error_collector_;
|
|
256
|
+
|
|
257
|
+
char current_char_; // == buffer_[buffer_pos_], updated by NextChar().
|
|
258
|
+
const char* buffer_; // Current buffer returned from input_.
|
|
259
|
+
int buffer_size_; // Size of buffer_.
|
|
260
|
+
int buffer_pos_; // Current position within the buffer.
|
|
261
|
+
bool read_error_; // Did we previously encounter a read error?
|
|
262
|
+
|
|
263
|
+
// Line and column number of current_char_ within the whole input stream.
|
|
264
|
+
int line_;
|
|
265
|
+
int column_;
|
|
266
|
+
|
|
267
|
+
// String to which text should be appended as we advance through it.
|
|
268
|
+
// Call RecordTo(&str) to start recording and StopRecording() to stop.
|
|
269
|
+
// E.g. StartToken() calls RecordTo(¤t_.text). record_start_ is the
|
|
270
|
+
// position within the current buffer where recording started.
|
|
271
|
+
string* record_target_;
|
|
272
|
+
int record_start_;
|
|
273
|
+
|
|
274
|
+
// Options.
|
|
275
|
+
bool allow_f_after_float_;
|
|
276
|
+
CommentStyle comment_style_;
|
|
277
|
+
bool require_space_after_number_;
|
|
278
|
+
bool allow_multiline_strings_;
|
|
279
|
+
|
|
280
|
+
// Since we count columns we need to interpret tabs somehow. We'll take
|
|
281
|
+
// the standard 8-character definition for lack of any way to do better.
|
|
282
|
+
static const int kTabWidth = 8;
|
|
283
|
+
|
|
284
|
+
// -----------------------------------------------------------------
|
|
285
|
+
// Helper methods.
|
|
286
|
+
|
|
287
|
+
// Consume this character and advance to the next one.
|
|
288
|
+
void NextChar();
|
|
289
|
+
|
|
290
|
+
// Read a new buffer from the input.
|
|
291
|
+
void Refresh();
|
|
292
|
+
|
|
293
|
+
inline void RecordTo(string* target);
|
|
294
|
+
inline void StopRecording();
|
|
295
|
+
|
|
296
|
+
// Called when the current character is the first character of a new
|
|
297
|
+
// token (not including whitespace or comments).
|
|
298
|
+
inline void StartToken();
|
|
299
|
+
// Called when the current character is the first character after the
|
|
300
|
+
// end of the last token. After this returns, current_.text will
|
|
301
|
+
// contain all text consumed since StartToken() was called.
|
|
302
|
+
inline void EndToken();
|
|
303
|
+
|
|
304
|
+
// Convenience method to add an error at the current line and column.
|
|
305
|
+
void AddError(const string& message) {
|
|
306
|
+
error_collector_->AddError(line_, column_, message);
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
// -----------------------------------------------------------------
|
|
310
|
+
// The following four methods are used to consume tokens of specific
|
|
311
|
+
// types. They are actually used to consume all characters *after*
|
|
312
|
+
// the first, since the calling function consumes the first character
|
|
313
|
+
// in order to decide what kind of token is being read.
|
|
314
|
+
|
|
315
|
+
// Read and consume a string, ending when the given delimiter is
|
|
316
|
+
// consumed.
|
|
317
|
+
void ConsumeString(char delimiter);
|
|
318
|
+
|
|
319
|
+
// Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
|
|
320
|
+
// depending on what was read. This needs to know if the first
|
|
321
|
+
// character was a zero in order to correctly recognize hex and octal
|
|
322
|
+
// numbers.
|
|
323
|
+
// It also needs to know if the first characted was a . to parse floating
|
|
324
|
+
// point correctly.
|
|
325
|
+
TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
|
|
326
|
+
|
|
327
|
+
// Consume the rest of a line.
|
|
328
|
+
void ConsumeLineComment(string* content);
|
|
329
|
+
// Consume until "*/".
|
|
330
|
+
void ConsumeBlockComment(string* content);
|
|
331
|
+
|
|
332
|
+
enum NextCommentStatus {
|
|
333
|
+
// Started a line comment.
|
|
334
|
+
LINE_COMMENT,
|
|
335
|
+
|
|
336
|
+
// Started a block comment.
|
|
337
|
+
BLOCK_COMMENT,
|
|
338
|
+
|
|
339
|
+
// Consumed a slash, then realized it wasn't a comment. current_ has
|
|
340
|
+
// been filled in with a slash token. The caller should return it.
|
|
341
|
+
SLASH_NOT_COMMENT,
|
|
342
|
+
|
|
343
|
+
// We do not appear to be starting a comment here.
|
|
344
|
+
NO_COMMENT
|
|
345
|
+
};
|
|
346
|
+
|
|
347
|
+
// If we're at the start of a new comment, consume it and return what kind
|
|
348
|
+
// of comment it is.
|
|
349
|
+
NextCommentStatus TryConsumeCommentStart();
|
|
350
|
+
|
|
351
|
+
// -----------------------------------------------------------------
|
|
352
|
+
// These helper methods make the parsing code more readable. The
|
|
353
|
+
// "character classes" refered to are defined at the top of the .cc file.
|
|
354
|
+
// Basically it is a C++ class with one method:
|
|
355
|
+
// static bool InClass(char c);
|
|
356
|
+
// The method returns true if c is a member of this "class", like "Letter"
|
|
357
|
+
// or "Digit".
|
|
358
|
+
|
|
359
|
+
// Returns true if the current character is of the given character
|
|
360
|
+
// class, but does not consume anything.
|
|
361
|
+
template<typename CharacterClass>
|
|
362
|
+
inline bool LookingAt();
|
|
363
|
+
|
|
364
|
+
// If the current character is in the given class, consume it and return
|
|
365
|
+
// true. Otherwise return false.
|
|
366
|
+
// e.g. TryConsumeOne<Letter>()
|
|
367
|
+
template<typename CharacterClass>
|
|
368
|
+
inline bool TryConsumeOne();
|
|
369
|
+
|
|
370
|
+
// Like above, but try to consume the specific character indicated.
|
|
371
|
+
inline bool TryConsume(char c);
|
|
372
|
+
|
|
373
|
+
// Consume zero or more of the given character class.
|
|
374
|
+
template<typename CharacterClass>
|
|
375
|
+
inline void ConsumeZeroOrMore();
|
|
376
|
+
|
|
377
|
+
// Consume one or more of the given character class or log the given
|
|
378
|
+
// error message.
|
|
379
|
+
// e.g. ConsumeOneOrMore<Digit>("Expected digits.");
|
|
380
|
+
template<typename CharacterClass>
|
|
381
|
+
inline void ConsumeOneOrMore(const char* error);
|
|
382
|
+
};
|
|
383
|
+
|
|
384
|
+
// inline methods ====================================================
|
|
385
|
+
inline const Tokenizer::Token& Tokenizer::current() {
|
|
386
|
+
return current_;
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
inline const Tokenizer::Token& Tokenizer::previous() {
|
|
390
|
+
return previous_;
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
inline void Tokenizer::ParseString(const string& text, string* output) {
|
|
394
|
+
output->clear();
|
|
395
|
+
ParseStringAppend(text, output);
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
} // namespace io
|
|
399
|
+
} // namespace protobuf
|
|
400
|
+
|
|
401
|
+
} // namespace google
|
|
402
|
+
#endif // GOOGLE_PROTOBUF_IO_TOKENIZER_H__
|
|
@@ -0,0 +1,999 @@
|
|
|
1
|
+
// Protocol Buffers - Google's data interchange format
|
|
2
|
+
// Copyright 2008 Google Inc. All rights reserved.
|
|
3
|
+
// https://developers.google.com/protocol-buffers/
|
|
4
|
+
//
|
|
5
|
+
// Redistribution and use in source and binary forms, with or without
|
|
6
|
+
// modification, are permitted provided that the following conditions are
|
|
7
|
+
// met:
|
|
8
|
+
//
|
|
9
|
+
// * Redistributions of source code must retain the above copyright
|
|
10
|
+
// notice, this list of conditions and the following disclaimer.
|
|
11
|
+
// * Redistributions in binary form must reproduce the above
|
|
12
|
+
// copyright notice, this list of conditions and the following disclaimer
|
|
13
|
+
// in the documentation and/or other materials provided with the
|
|
14
|
+
// distribution.
|
|
15
|
+
// * Neither the name of Google Inc. nor the names of its
|
|
16
|
+
// contributors may be used to endorse or promote products derived from
|
|
17
|
+
// this software without specific prior written permission.
|
|
18
|
+
//
|
|
19
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
20
|
+
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
21
|
+
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
22
|
+
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
23
|
+
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
24
|
+
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
25
|
+
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
26
|
+
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
27
|
+
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
28
|
+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
29
|
+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
30
|
+
|
|
31
|
+
// Author: kenton@google.com (Kenton Varda)
|
|
32
|
+
// Based on original Protocol Buffers design by
|
|
33
|
+
// Sanjay Ghemawat, Jeff Dean, and others.
|
|
34
|
+
|
|
35
|
+
#include <limits.h>
|
|
36
|
+
#include <math.h>
|
|
37
|
+
|
|
38
|
+
#include <vector>
|
|
39
|
+
|
|
40
|
+
#include <google/protobuf/io/tokenizer.h>
|
|
41
|
+
#include <google/protobuf/io/zero_copy_stream_impl.h>
|
|
42
|
+
|
|
43
|
+
#include <google/protobuf/stubs/common.h>
|
|
44
|
+
#include <google/protobuf/stubs/strutil.h>
|
|
45
|
+
#include <google/protobuf/stubs/substitute.h>
|
|
46
|
+
#include <google/protobuf/testing/googletest.h>
|
|
47
|
+
#include <gtest/gtest.h>
|
|
48
|
+
|
|
49
|
+
namespace google {
|
|
50
|
+
namespace protobuf {
|
|
51
|
+
namespace io {
|
|
52
|
+
namespace {
|
|
53
|
+
|
|
54
|
+
// ===================================================================
|
|
55
|
+
// Data-Driven Test Infrastructure
|
|
56
|
+
|
|
57
|
+
// TODO(kenton): This is copied from coded_stream_unittest. This is
|
|
58
|
+
// temporary until these fetaures are integrated into gTest itself.
|
|
59
|
+
|
|
60
|
+
// TEST_1D and TEST_2D are macros I'd eventually like to see added to
|
|
61
|
+
// gTest. These macros can be used to declare tests which should be
|
|
62
|
+
// run multiple times, once for each item in some input array. TEST_1D
|
|
63
|
+
// tests all cases in a single input array. TEST_2D tests all
|
|
64
|
+
// combinations of cases from two arrays. The arrays must be statically
|
|
65
|
+
// defined such that the GOOGLE_ARRAYSIZE() macro works on them. Example:
|
|
66
|
+
//
|
|
67
|
+
// int kCases[] = {1, 2, 3, 4}
|
|
68
|
+
// TEST_1D(MyFixture, MyTest, kCases) {
|
|
69
|
+
// EXPECT_GT(kCases_case, 0);
|
|
70
|
+
// }
|
|
71
|
+
//
|
|
72
|
+
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
|
|
73
|
+
// they are all grater than zero. In case of failure, the exact case
|
|
74
|
+
// which failed will be printed. The case type must be printable using
|
|
75
|
+
// ostream::operator<<.
|
|
76
|
+
|
|
77
|
+
#define TEST_1D(FIXTURE, NAME, CASES) \
|
|
78
|
+
class FIXTURE##_##NAME##_DD : public FIXTURE { \
|
|
79
|
+
protected: \
|
|
80
|
+
template <typename CaseType> \
|
|
81
|
+
void DoSingleCase(const CaseType& CASES##_case); \
|
|
82
|
+
}; \
|
|
83
|
+
\
|
|
84
|
+
TEST_F(FIXTURE##_##NAME##_DD, NAME) { \
|
|
85
|
+
for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) { \
|
|
86
|
+
SCOPED_TRACE(testing::Message() \
|
|
87
|
+
<< #CASES " case #" << i << ": " << CASES[i]); \
|
|
88
|
+
DoSingleCase(CASES[i]); \
|
|
89
|
+
} \
|
|
90
|
+
} \
|
|
91
|
+
\
|
|
92
|
+
template <typename CaseType> \
|
|
93
|
+
void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)
|
|
94
|
+
|
|
95
|
+
#define TEST_2D(FIXTURE, NAME, CASES1, CASES2) \
|
|
96
|
+
class FIXTURE##_##NAME##_DD : public FIXTURE { \
|
|
97
|
+
protected: \
|
|
98
|
+
template <typename CaseType1, typename CaseType2> \
|
|
99
|
+
void DoSingleCase(const CaseType1& CASES1##_case, \
|
|
100
|
+
const CaseType2& CASES2##_case); \
|
|
101
|
+
}; \
|
|
102
|
+
\
|
|
103
|
+
TEST_F(FIXTURE##_##NAME##_DD, NAME) { \
|
|
104
|
+
for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) { \
|
|
105
|
+
for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) { \
|
|
106
|
+
SCOPED_TRACE(testing::Message() \
|
|
107
|
+
<< #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
|
|
108
|
+
<< #CASES2 " case #" << j << ": " << CASES2[j]); \
|
|
109
|
+
DoSingleCase(CASES1[i], CASES2[j]); \
|
|
110
|
+
} \
|
|
111
|
+
} \
|
|
112
|
+
} \
|
|
113
|
+
\
|
|
114
|
+
template <typename CaseType1, typename CaseType2> \
|
|
115
|
+
void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
|
|
116
|
+
const CaseType2& CASES2##_case)
|
|
117
|
+
|
|
118
|
+
// -------------------------------------------------------------------
|
|
119
|
+
|
|
120
|
+
// An input stream that is basically like an ArrayInputStream but sometimes
|
|
121
|
+
// returns empty buffers, just to throw us off.
|
|
122
|
+
class TestInputStream : public ZeroCopyInputStream {
|
|
123
|
+
public:
|
|
124
|
+
TestInputStream(const void* data, int size, int block_size)
|
|
125
|
+
: array_stream_(data, size, block_size), counter_(0) {}
|
|
126
|
+
~TestInputStream() {}
|
|
127
|
+
|
|
128
|
+
// implements ZeroCopyInputStream ----------------------------------
|
|
129
|
+
bool Next(const void** data, int* size) {
|
|
130
|
+
// We'll return empty buffers starting with the first buffer, and every
|
|
131
|
+
// 3 and 5 buffers after that.
|
|
132
|
+
if (counter_ % 3 == 0 || counter_ % 5 == 0) {
|
|
133
|
+
*data = NULL;
|
|
134
|
+
*size = 0;
|
|
135
|
+
++counter_;
|
|
136
|
+
return true;
|
|
137
|
+
} else {
|
|
138
|
+
++counter_;
|
|
139
|
+
return array_stream_.Next(data, size);
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
void BackUp(int count) { return array_stream_.BackUp(count); }
|
|
144
|
+
bool Skip(int count) { return array_stream_.Skip(count); }
|
|
145
|
+
int64 ByteCount() const { return array_stream_.ByteCount(); }
|
|
146
|
+
|
|
147
|
+
private:
|
|
148
|
+
ArrayInputStream array_stream_;
|
|
149
|
+
int counter_;
|
|
150
|
+
};
|
|
151
|
+
|
|
152
|
+
// -------------------------------------------------------------------
|
|
153
|
+
|
|
154
|
+
// An error collector which simply concatenates all its errors into a big
|
|
155
|
+
// block of text which can be checked.
|
|
156
|
+
class TestErrorCollector : public ErrorCollector {
|
|
157
|
+
public:
|
|
158
|
+
TestErrorCollector() {}
|
|
159
|
+
~TestErrorCollector() {}
|
|
160
|
+
|
|
161
|
+
string text_;
|
|
162
|
+
|
|
163
|
+
// implements ErrorCollector ---------------------------------------
|
|
164
|
+
void AddError(int line, int column, const string& message) {
|
|
165
|
+
strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
|
|
166
|
+
line, column, message);
|
|
167
|
+
}
|
|
168
|
+
};
|
|
169
|
+
|
|
170
|
+
// -------------------------------------------------------------------
|
|
171
|
+
|
|
172
|
+
// We test each operation over a variety of block sizes to insure that
|
|
173
|
+
// we test cases where reads cross buffer boundaries as well as cases
|
|
174
|
+
// where they don't. This is sort of a brute-force approach to this,
|
|
175
|
+
// but it's easy to write and easy to understand.
|
|
176
|
+
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};
|
|
177
|
+
|
|
178
|
+
class TokenizerTest : public testing::Test {
|
|
179
|
+
protected:
|
|
180
|
+
// For easy testing.
|
|
181
|
+
uint64 ParseInteger(const string& text) {
|
|
182
|
+
uint64 result;
|
|
183
|
+
EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
|
|
184
|
+
return result;
|
|
185
|
+
}
|
|
186
|
+
};
|
|
187
|
+
|
|
188
|
+
// ===================================================================
|
|
189
|
+
|
|
190
|
+
// These tests causes gcc 3.3.5 (and earlier?) to give the cryptic error:
|
|
191
|
+
// "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
|
|
192
|
+
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
|
|
193
|
+
|
|
194
|
+
// In each test case, the entire input text should parse as a single token
|
|
195
|
+
// of the given type.
|
|
196
|
+
struct SimpleTokenCase {
|
|
197
|
+
string input;
|
|
198
|
+
Tokenizer::TokenType type;
|
|
199
|
+
};
|
|
200
|
+
|
|
201
|
+
inline ostream& operator<<(ostream& out,
|
|
202
|
+
const SimpleTokenCase& test_case) {
|
|
203
|
+
return out << CEscape(test_case.input);
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
SimpleTokenCase kSimpleTokenCases[] = {
|
|
207
|
+
// Test identifiers.
|
|
208
|
+
{ "hello", Tokenizer::TYPE_IDENTIFIER },
|
|
209
|
+
|
|
210
|
+
// Test integers.
|
|
211
|
+
{ "123", Tokenizer::TYPE_INTEGER },
|
|
212
|
+
{ "0xab6", Tokenizer::TYPE_INTEGER },
|
|
213
|
+
{ "0XAB6", Tokenizer::TYPE_INTEGER },
|
|
214
|
+
{ "0X1234567", Tokenizer::TYPE_INTEGER },
|
|
215
|
+
{ "0x89abcdef", Tokenizer::TYPE_INTEGER },
|
|
216
|
+
{ "0x89ABCDEF", Tokenizer::TYPE_INTEGER },
|
|
217
|
+
{ "01234567", Tokenizer::TYPE_INTEGER },
|
|
218
|
+
|
|
219
|
+
// Test floats.
|
|
220
|
+
{ "123.45", Tokenizer::TYPE_FLOAT },
|
|
221
|
+
{ "1.", Tokenizer::TYPE_FLOAT },
|
|
222
|
+
{ "1e3", Tokenizer::TYPE_FLOAT },
|
|
223
|
+
{ "1E3", Tokenizer::TYPE_FLOAT },
|
|
224
|
+
{ "1e-3", Tokenizer::TYPE_FLOAT },
|
|
225
|
+
{ "1e+3", Tokenizer::TYPE_FLOAT },
|
|
226
|
+
{ "1.e3", Tokenizer::TYPE_FLOAT },
|
|
227
|
+
{ "1.2e3", Tokenizer::TYPE_FLOAT },
|
|
228
|
+
{ ".1", Tokenizer::TYPE_FLOAT },
|
|
229
|
+
{ ".1e3", Tokenizer::TYPE_FLOAT },
|
|
230
|
+
{ ".1e-3", Tokenizer::TYPE_FLOAT },
|
|
231
|
+
{ ".1e+3", Tokenizer::TYPE_FLOAT },
|
|
232
|
+
|
|
233
|
+
// Test strings.
|
|
234
|
+
{ "'hello'", Tokenizer::TYPE_STRING },
|
|
235
|
+
{ "\"foo\"", Tokenizer::TYPE_STRING },
|
|
236
|
+
{ "'a\"b'", Tokenizer::TYPE_STRING },
|
|
237
|
+
{ "\"a'b\"", Tokenizer::TYPE_STRING },
|
|
238
|
+
{ "'a\\'b'", Tokenizer::TYPE_STRING },
|
|
239
|
+
{ "\"a\\\"b\"", Tokenizer::TYPE_STRING },
|
|
240
|
+
{ "'\\xf'", Tokenizer::TYPE_STRING },
|
|
241
|
+
{ "'\\0'", Tokenizer::TYPE_STRING },
|
|
242
|
+
|
|
243
|
+
// Test symbols.
|
|
244
|
+
{ "+", Tokenizer::TYPE_SYMBOL },
|
|
245
|
+
{ ".", Tokenizer::TYPE_SYMBOL },
|
|
246
|
+
};
|
|
247
|
+
|
|
248
|
+
TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
|
|
249
|
+
// Set up the tokenizer.
|
|
250
|
+
TestInputStream input(kSimpleTokenCases_case.input.data(),
|
|
251
|
+
kSimpleTokenCases_case.input.size(),
|
|
252
|
+
kBlockSizes_case);
|
|
253
|
+
TestErrorCollector error_collector;
|
|
254
|
+
Tokenizer tokenizer(&input, &error_collector);
|
|
255
|
+
|
|
256
|
+
// Before Next() is called, the initial token should always be TYPE_START.
|
|
257
|
+
EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
|
|
258
|
+
EXPECT_EQ("", tokenizer.current().text);
|
|
259
|
+
EXPECT_EQ(0, tokenizer.current().line);
|
|
260
|
+
EXPECT_EQ(0, tokenizer.current().column);
|
|
261
|
+
EXPECT_EQ(0, tokenizer.current().end_column);
|
|
262
|
+
|
|
263
|
+
// Parse the token.
|
|
264
|
+
ASSERT_TRUE(tokenizer.Next());
|
|
265
|
+
|
|
266
|
+
// Check that it has the right type.
|
|
267
|
+
EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
|
|
268
|
+
// Check that it contains the complete input text.
|
|
269
|
+
EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
|
|
270
|
+
// Check that it is located at the beginning of the input
|
|
271
|
+
EXPECT_EQ(0, tokenizer.current().line);
|
|
272
|
+
EXPECT_EQ(0, tokenizer.current().column);
|
|
273
|
+
EXPECT_EQ(kSimpleTokenCases_case.input.size(),
|
|
274
|
+
tokenizer.current().end_column);
|
|
275
|
+
|
|
276
|
+
// There should be no more input.
|
|
277
|
+
EXPECT_FALSE(tokenizer.Next());
|
|
278
|
+
|
|
279
|
+
// After Next() returns false, the token should have type TYPE_END.
|
|
280
|
+
EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
|
|
281
|
+
EXPECT_EQ("", tokenizer.current().text);
|
|
282
|
+
EXPECT_EQ(0, tokenizer.current().line);
|
|
283
|
+
EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
|
|
284
|
+
EXPECT_EQ(kSimpleTokenCases_case.input.size(),
|
|
285
|
+
tokenizer.current().end_column);
|
|
286
|
+
|
|
287
|
+
// There should be no errors.
|
|
288
|
+
EXPECT_TRUE(error_collector.text_.empty());
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
|
|
292
|
+
// Test the "allow_f_after_float" option.
|
|
293
|
+
|
|
294
|
+
// Set up the tokenizer.
|
|
295
|
+
const char* text = "1f 2.5f 6e3f 7F";
|
|
296
|
+
TestInputStream input(text, strlen(text), kBlockSizes_case);
|
|
297
|
+
TestErrorCollector error_collector;
|
|
298
|
+
Tokenizer tokenizer(&input, &error_collector);
|
|
299
|
+
tokenizer.set_allow_f_after_float(true);
|
|
300
|
+
|
|
301
|
+
// Advance through tokens and check that they are parsed as expected.
|
|
302
|
+
ASSERT_TRUE(tokenizer.Next());
|
|
303
|
+
EXPECT_EQ(tokenizer.current().text, "1f");
|
|
304
|
+
EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
|
|
305
|
+
ASSERT_TRUE(tokenizer.Next());
|
|
306
|
+
EXPECT_EQ(tokenizer.current().text, "2.5f");
|
|
307
|
+
EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
|
|
308
|
+
ASSERT_TRUE(tokenizer.Next());
|
|
309
|
+
EXPECT_EQ(tokenizer.current().text, "6e3f");
|
|
310
|
+
EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
|
|
311
|
+
ASSERT_TRUE(tokenizer.Next());
|
|
312
|
+
EXPECT_EQ(tokenizer.current().text, "7F");
|
|
313
|
+
EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
|
|
314
|
+
|
|
315
|
+
// There should be no more input.
|
|
316
|
+
EXPECT_FALSE(tokenizer.Next());
|
|
317
|
+
// There should be no errors.
|
|
318
|
+
EXPECT_TRUE(error_collector.text_.empty());
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
#endif
|
|
322
|
+
|
|
323
|
+
// -------------------------------------------------------------------
|
|
324
|
+
|
|
325
|
+
// In each case, the input is parsed to produce a list of tokens. The
|
|
326
|
+
// last token in "output" must have type TYPE_END.
|
|
327
|
+
struct MultiTokenCase {
|
|
328
|
+
string input;
|
|
329
|
+
Tokenizer::Token output[10]; // The compiler wants a constant array
|
|
330
|
+
// size for initialization to work. There
|
|
331
|
+
// is no reason this can't be increased if
|
|
332
|
+
// needed.
|
|
333
|
+
};
|
|
334
|
+
|
|
335
|
+
inline ostream& operator<<(ostream& out,
|
|
336
|
+
const MultiTokenCase& test_case) {
|
|
337
|
+
return out << CEscape(test_case.input);
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
MultiTokenCase kMultiTokenCases[] = {
|
|
341
|
+
// Test empty input.
|
|
342
|
+
{ "", {
|
|
343
|
+
{ Tokenizer::TYPE_END , "" , 0, 0 },
|
|
344
|
+
}},
|
|
345
|
+
|
|
346
|
+
// Test all token types at the same time.
|
|
347
|
+
{ "foo 1 1.2 + 'bar'", {
|
|
348
|
+
{ Tokenizer::TYPE_IDENTIFIER, "foo" , 0, 0, 3 },
|
|
349
|
+
{ Tokenizer::TYPE_INTEGER , "1" , 0, 4, 5 },
|
|
350
|
+
{ Tokenizer::TYPE_FLOAT , "1.2" , 0, 6, 9 },
|
|
351
|
+
{ Tokenizer::TYPE_SYMBOL , "+" , 0, 10, 11 },
|
|
352
|
+
{ Tokenizer::TYPE_STRING , "'bar'", 0, 12, 17 },
|
|
353
|
+
{ Tokenizer::TYPE_END , "" , 0, 17, 17 },
|
|
354
|
+
}},
|
|
355
|
+
|
|
356
|
+
// Test that consecutive symbols are parsed as separate tokens.
|
|
357
|
+
{ "!@+%", {
|
|
358
|
+
{ Tokenizer::TYPE_SYMBOL , "!" , 0, 0, 1 },
|
|
359
|
+
{ Tokenizer::TYPE_SYMBOL , "@" , 0, 1, 2 },
|
|
360
|
+
{ Tokenizer::TYPE_SYMBOL , "+" , 0, 2, 3 },
|
|
361
|
+
{ Tokenizer::TYPE_SYMBOL , "%" , 0, 3, 4 },
|
|
362
|
+
{ Tokenizer::TYPE_END , "" , 0, 4, 4 },
|
|
363
|
+
}},
|
|
364
|
+
|
|
365
|
+
// Test that newlines affect line numbers correctly.
|
|
366
|
+
{ "foo bar\nrab oof", {
|
|
367
|
+
{ Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
|
|
368
|
+
{ Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4, 7 },
|
|
369
|
+
{ Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0, 3 },
|
|
370
|
+
{ Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4, 7 },
|
|
371
|
+
{ Tokenizer::TYPE_END , "" , 1, 7, 7 },
|
|
372
|
+
}},
|
|
373
|
+
|
|
374
|
+
// Test that tabs affect column numbers correctly.
|
|
375
|
+
{ "foo\tbar \tbaz", {
|
|
376
|
+
{ Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
|
|
377
|
+
{ Tokenizer::TYPE_IDENTIFIER, "bar", 0, 8, 11 },
|
|
378
|
+
{ Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 },
|
|
379
|
+
{ Tokenizer::TYPE_END , "" , 0, 19, 19 },
|
|
380
|
+
}},
|
|
381
|
+
|
|
382
|
+
// Test that tabs in string literals affect column numbers correctly.
|
|
383
|
+
{ "\"foo\tbar\" baz", {
|
|
384
|
+
{ Tokenizer::TYPE_STRING , "\"foo\tbar\"", 0, 0, 12 },
|
|
385
|
+
{ Tokenizer::TYPE_IDENTIFIER, "baz" , 0, 13, 16 },
|
|
386
|
+
{ Tokenizer::TYPE_END , "" , 0, 16, 16 },
|
|
387
|
+
}},
|
|
388
|
+
|
|
389
|
+
// Test that line comments are ignored.
|
|
390
|
+
{ "foo // This is a comment\n"
|
|
391
|
+
"bar // This is another comment", {
|
|
392
|
+
{ Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
|
|
393
|
+
{ Tokenizer::TYPE_IDENTIFIER, "bar", 1, 0, 3 },
|
|
394
|
+
{ Tokenizer::TYPE_END , "" , 1, 30, 30 },
|
|
395
|
+
}},
|
|
396
|
+
|
|
397
|
+
// Test that block comments are ignored.
|
|
398
|
+
{ "foo /* This is a block comment */ bar", {
|
|
399
|
+
{ Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
|
|
400
|
+
{ Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 },
|
|
401
|
+
{ Tokenizer::TYPE_END , "" , 0, 37, 37 },
|
|
402
|
+
}},
|
|
403
|
+
|
|
404
|
+
// Test that sh-style comments are not ignored by default.
|
|
405
|
+
{ "foo # bar\n"
|
|
406
|
+
"baz", {
|
|
407
|
+
{ Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
|
|
408
|
+
{ Tokenizer::TYPE_SYMBOL , "#" , 0, 4, 5 },
|
|
409
|
+
{ Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 },
|
|
410
|
+
{ Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 },
|
|
411
|
+
{ Tokenizer::TYPE_END , "" , 1, 3, 3 },
|
|
412
|
+
}},
|
|
413
|
+
|
|
414
|
+
// Test all whitespace chars
|
|
415
|
+
{ "foo\n\t\r\v\fbar", {
|
|
416
|
+
{ Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
|
|
417
|
+
{ Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 },
|
|
418
|
+
{ Tokenizer::TYPE_END , "" , 1, 14, 14 },
|
|
419
|
+
}},
|
|
420
|
+
};
|
|
421
|
+
|
|
422
|
+
TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
|
|
423
|
+
// Set up the tokenizer.
|
|
424
|
+
TestInputStream input(kMultiTokenCases_case.input.data(),
|
|
425
|
+
kMultiTokenCases_case.input.size(),
|
|
426
|
+
kBlockSizes_case);
|
|
427
|
+
TestErrorCollector error_collector;
|
|
428
|
+
Tokenizer tokenizer(&input, &error_collector);
|
|
429
|
+
|
|
430
|
+
// Before Next() is called, the initial token should always be TYPE_START.
|
|
431
|
+
EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
|
|
432
|
+
EXPECT_EQ("", tokenizer.current().text);
|
|
433
|
+
EXPECT_EQ(0, tokenizer.current().line);
|
|
434
|
+
EXPECT_EQ(0, tokenizer.current().column);
|
|
435
|
+
EXPECT_EQ(0, tokenizer.current().end_column);
|
|
436
|
+
|
|
437
|
+
// Loop through all expected tokens.
|
|
438
|
+
int i = 0;
|
|
439
|
+
Tokenizer::Token token;
|
|
440
|
+
do {
|
|
441
|
+
token = kMultiTokenCases_case.output[i++];
|
|
442
|
+
|
|
443
|
+
SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);
|
|
444
|
+
|
|
445
|
+
Tokenizer::Token previous = tokenizer.current();
|
|
446
|
+
|
|
447
|
+
// Next() should only return false when it hits the end token.
|
|
448
|
+
if (token.type != Tokenizer::TYPE_END) {
|
|
449
|
+
ASSERT_TRUE(tokenizer.Next());
|
|
450
|
+
} else {
|
|
451
|
+
ASSERT_FALSE(tokenizer.Next());
|
|
452
|
+
}
|
|
453
|
+
|
|
454
|
+
// Check that the previous token is set correctly.
|
|
455
|
+
EXPECT_EQ(previous.type, tokenizer.previous().type);
|
|
456
|
+
EXPECT_EQ(previous.text, tokenizer.previous().text);
|
|
457
|
+
EXPECT_EQ(previous.line, tokenizer.previous().line);
|
|
458
|
+
EXPECT_EQ(previous.column, tokenizer.previous().column);
|
|
459
|
+
EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);
|
|
460
|
+
|
|
461
|
+
// Check that the token matches the expected one.
|
|
462
|
+
EXPECT_EQ(token.type, tokenizer.current().type);
|
|
463
|
+
EXPECT_EQ(token.text, tokenizer.current().text);
|
|
464
|
+
EXPECT_EQ(token.line, tokenizer.current().line);
|
|
465
|
+
EXPECT_EQ(token.column, tokenizer.current().column);
|
|
466
|
+
EXPECT_EQ(token.end_column, tokenizer.current().end_column);
|
|
467
|
+
|
|
468
|
+
} while (token.type != Tokenizer::TYPE_END);
|
|
469
|
+
|
|
470
|
+
// There should be no errors.
|
|
471
|
+
EXPECT_TRUE(error_collector.text_.empty());
|
|
472
|
+
}
|
|
473
|
+
|
|
474
|
+
// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
|
|
475
|
+
// "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
|
|
476
|
+
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
|
|
477
|
+
|
|
478
|
+
TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
|
|
479
|
+
// Test the "comment_style" option.
|
|
480
|
+
|
|
481
|
+
const char* text = "foo # bar\n"
|
|
482
|
+
"baz // qux\n"
|
|
483
|
+
"corge /* grault */\n"
|
|
484
|
+
"garply";
|
|
485
|
+
const char* const kTokens[] = {"foo", // "# bar" is ignored
|
|
486
|
+
"baz", "/", "/", "qux",
|
|
487
|
+
"corge", "/", "*", "grault", "*", "/",
|
|
488
|
+
"garply"};
|
|
489
|
+
|
|
490
|
+
// Set up the tokenizer.
|
|
491
|
+
TestInputStream input(text, strlen(text), kBlockSizes_case);
|
|
492
|
+
TestErrorCollector error_collector;
|
|
493
|
+
Tokenizer tokenizer(&input, &error_collector);
|
|
494
|
+
tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);
|
|
495
|
+
|
|
496
|
+
// Advance through tokens and check that they are parsed as expected.
|
|
497
|
+
for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
|
|
498
|
+
EXPECT_TRUE(tokenizer.Next());
|
|
499
|
+
EXPECT_EQ(tokenizer.current().text, kTokens[i]);
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
// There should be no more input.
|
|
503
|
+
EXPECT_FALSE(tokenizer.Next());
|
|
504
|
+
// There should be no errors.
|
|
505
|
+
EXPECT_TRUE(error_collector.text_.empty());
|
|
506
|
+
}
|
|
507
|
+
|
|
508
|
+
#endif
|
|
509
|
+
|
|
510
|
+
// -------------------------------------------------------------------
|
|
511
|
+
|
|
512
|
+
// In each case, the input is expected to have two tokens named "prev" and
|
|
513
|
+
// "next" with comments in between.
|
|
514
|
+
struct DocCommentCase {
|
|
515
|
+
string input;
|
|
516
|
+
|
|
517
|
+
const char* prev_trailing_comments;
|
|
518
|
+
const char* detached_comments[10];
|
|
519
|
+
const char* next_leading_comments;
|
|
520
|
+
};
|
|
521
|
+
|
|
522
|
+
inline ostream& operator<<(ostream& out,
|
|
523
|
+
const DocCommentCase& test_case) {
|
|
524
|
+
return out << CEscape(test_case.input);
|
|
525
|
+
}
|
|
526
|
+
|
|
527
|
+
DocCommentCase kDocCommentCases[] = {
|
|
528
|
+
{
|
|
529
|
+
"prev next",
|
|
530
|
+
|
|
531
|
+
"",
|
|
532
|
+
{},
|
|
533
|
+
""
|
|
534
|
+
},
|
|
535
|
+
|
|
536
|
+
{
|
|
537
|
+
"prev /* ignored */ next",
|
|
538
|
+
|
|
539
|
+
"",
|
|
540
|
+
{},
|
|
541
|
+
""
|
|
542
|
+
},
|
|
543
|
+
|
|
544
|
+
{
|
|
545
|
+
"prev // trailing comment\n"
|
|
546
|
+
"next",
|
|
547
|
+
|
|
548
|
+
" trailing comment\n",
|
|
549
|
+
{},
|
|
550
|
+
""
|
|
551
|
+
},
|
|
552
|
+
|
|
553
|
+
{
|
|
554
|
+
"prev\n"
|
|
555
|
+
"// leading comment\n"
|
|
556
|
+
"// line 2\n"
|
|
557
|
+
"next",
|
|
558
|
+
|
|
559
|
+
"",
|
|
560
|
+
{},
|
|
561
|
+
" leading comment\n"
|
|
562
|
+
" line 2\n"
|
|
563
|
+
},
|
|
564
|
+
|
|
565
|
+
{
|
|
566
|
+
"prev\n"
|
|
567
|
+
"// trailing comment\n"
|
|
568
|
+
"// line 2\n"
|
|
569
|
+
"\n"
|
|
570
|
+
"next",
|
|
571
|
+
|
|
572
|
+
" trailing comment\n"
|
|
573
|
+
" line 2\n",
|
|
574
|
+
{},
|
|
575
|
+
""
|
|
576
|
+
},
|
|
577
|
+
|
|
578
|
+
{
|
|
579
|
+
"prev // trailing comment\n"
|
|
580
|
+
"// leading comment\n"
|
|
581
|
+
"// line 2\n"
|
|
582
|
+
"next",
|
|
583
|
+
|
|
584
|
+
" trailing comment\n",
|
|
585
|
+
{},
|
|
586
|
+
" leading comment\n"
|
|
587
|
+
" line 2\n"
|
|
588
|
+
},
|
|
589
|
+
|
|
590
|
+
{
|
|
591
|
+
"prev /* trailing block comment */\n"
|
|
592
|
+
"/* leading block comment\n"
|
|
593
|
+
" * line 2\n"
|
|
594
|
+
" * line 3 */"
|
|
595
|
+
"next",
|
|
596
|
+
|
|
597
|
+
" trailing block comment ",
|
|
598
|
+
{},
|
|
599
|
+
" leading block comment\n"
|
|
600
|
+
" line 2\n"
|
|
601
|
+
" line 3 "
|
|
602
|
+
},
|
|
603
|
+
|
|
604
|
+
{
|
|
605
|
+
"prev\n"
|
|
606
|
+
"/* trailing block comment\n"
|
|
607
|
+
" * line 2\n"
|
|
608
|
+
" * line 3\n"
|
|
609
|
+
" */\n"
|
|
610
|
+
"/* leading block comment\n"
|
|
611
|
+
" * line 2\n"
|
|
612
|
+
" * line 3 */"
|
|
613
|
+
"next",
|
|
614
|
+
|
|
615
|
+
" trailing block comment\n"
|
|
616
|
+
" line 2\n"
|
|
617
|
+
" line 3\n",
|
|
618
|
+
{},
|
|
619
|
+
" leading block comment\n"
|
|
620
|
+
" line 2\n"
|
|
621
|
+
" line 3 "
|
|
622
|
+
},
|
|
623
|
+
|
|
624
|
+
{
|
|
625
|
+
"prev\n"
|
|
626
|
+
"// trailing comment\n"
|
|
627
|
+
"\n"
|
|
628
|
+
"// detached comment\n"
|
|
629
|
+
"// line 2\n"
|
|
630
|
+
"\n"
|
|
631
|
+
"// second detached comment\n"
|
|
632
|
+
"/* third detached comment\n"
|
|
633
|
+
" * line 2 */\n"
|
|
634
|
+
"// leading comment\n"
|
|
635
|
+
"next",
|
|
636
|
+
|
|
637
|
+
" trailing comment\n",
|
|
638
|
+
{
|
|
639
|
+
" detached comment\n"
|
|
640
|
+
" line 2\n",
|
|
641
|
+
" second detached comment\n",
|
|
642
|
+
" third detached comment\n"
|
|
643
|
+
" line 2 "
|
|
644
|
+
},
|
|
645
|
+
" leading comment\n"
|
|
646
|
+
},
|
|
647
|
+
|
|
648
|
+
{
|
|
649
|
+
"prev /**/\n"
|
|
650
|
+
"\n"
|
|
651
|
+
"// detached comment\n"
|
|
652
|
+
"\n"
|
|
653
|
+
"// leading comment\n"
|
|
654
|
+
"next",
|
|
655
|
+
|
|
656
|
+
"",
|
|
657
|
+
{
|
|
658
|
+
" detached comment\n"
|
|
659
|
+
},
|
|
660
|
+
" leading comment\n"
|
|
661
|
+
},
|
|
662
|
+
|
|
663
|
+
{
|
|
664
|
+
"prev /**/\n"
|
|
665
|
+
"// leading comment\n"
|
|
666
|
+
"next",
|
|
667
|
+
|
|
668
|
+
"",
|
|
669
|
+
{},
|
|
670
|
+
" leading comment\n"
|
|
671
|
+
},
|
|
672
|
+
};
|
|
673
|
+
|
|
674
|
+
TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
|
|
675
|
+
// Set up the tokenizer.
|
|
676
|
+
TestInputStream input(kDocCommentCases_case.input.data(),
|
|
677
|
+
kDocCommentCases_case.input.size(),
|
|
678
|
+
kBlockSizes_case);
|
|
679
|
+
TestErrorCollector error_collector;
|
|
680
|
+
Tokenizer tokenizer(&input, &error_collector);
|
|
681
|
+
|
|
682
|
+
// Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
|
|
683
|
+
TestInputStream input2(kDocCommentCases_case.input.data(),
|
|
684
|
+
kDocCommentCases_case.input.size(),
|
|
685
|
+
kBlockSizes_case);
|
|
686
|
+
Tokenizer tokenizer2(&input2, &error_collector);
|
|
687
|
+
|
|
688
|
+
tokenizer.Next();
|
|
689
|
+
tokenizer2.Next();
|
|
690
|
+
|
|
691
|
+
EXPECT_EQ("prev", tokenizer.current().text);
|
|
692
|
+
EXPECT_EQ("prev", tokenizer2.current().text);
|
|
693
|
+
|
|
694
|
+
string prev_trailing_comments;
|
|
695
|
+
vector<string> detached_comments;
|
|
696
|
+
string next_leading_comments;
|
|
697
|
+
tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
|
|
698
|
+
&next_leading_comments);
|
|
699
|
+
tokenizer2.NextWithComments(NULL, NULL, NULL);
|
|
700
|
+
EXPECT_EQ("next", tokenizer.current().text);
|
|
701
|
+
EXPECT_EQ("next", tokenizer2.current().text);
|
|
702
|
+
|
|
703
|
+
EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
|
|
704
|
+
prev_trailing_comments);
|
|
705
|
+
|
|
706
|
+
for (int i = 0; i < detached_comments.size(); i++) {
|
|
707
|
+
ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases));
|
|
708
|
+
ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
|
|
709
|
+
EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
|
|
710
|
+
detached_comments[i]);
|
|
711
|
+
}
|
|
712
|
+
|
|
713
|
+
// Verify that we matched all the detached comments.
|
|
714
|
+
EXPECT_EQ(NULL,
|
|
715
|
+
kDocCommentCases_case.detached_comments[detached_comments.size()]);
|
|
716
|
+
|
|
717
|
+
EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
|
|
718
|
+
next_leading_comments);
|
|
719
|
+
}
|
|
720
|
+
|
|
721
|
+
// -------------------------------------------------------------------
|
|
722
|
+
|
|
723
|
+
// Test parse helpers. It's not really worth setting up a full data-driven
|
|
724
|
+
// test here.
|
|
725
|
+
TEST_F(TokenizerTest, ParseInteger) {
|
|
726
|
+
EXPECT_EQ(0, ParseInteger("0"));
|
|
727
|
+
EXPECT_EQ(123, ParseInteger("123"));
|
|
728
|
+
EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
|
|
729
|
+
EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
|
|
730
|
+
EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
|
|
731
|
+
EXPECT_EQ(01234567, ParseInteger("01234567"));
|
|
732
|
+
EXPECT_EQ(0X123, ParseInteger("0X123"));
|
|
733
|
+
|
|
734
|
+
// Test invalid integers that may still be tokenized as integers.
|
|
735
|
+
EXPECT_EQ(0, ParseInteger("0x"));
|
|
736
|
+
|
|
737
|
+
uint64 i;
|
|
738
|
+
#ifdef PROTOBUF_HAS_DEATH_TEST // death tests do not work on Windows yet
|
|
739
|
+
// Test invalid integers that will never be tokenized as integers.
|
|
740
|
+
EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i),
|
|
741
|
+
"passed text that could not have been tokenized as an integer");
|
|
742
|
+
EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("1.2", kuint64max, &i),
|
|
743
|
+
"passed text that could not have been tokenized as an integer");
|
|
744
|
+
EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("08", kuint64max, &i),
|
|
745
|
+
"passed text that could not have been tokenized as an integer");
|
|
746
|
+
EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("0xg", kuint64max, &i),
|
|
747
|
+
"passed text that could not have been tokenized as an integer");
|
|
748
|
+
EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i),
|
|
749
|
+
"passed text that could not have been tokenized as an integer");
|
|
750
|
+
#endif // PROTOBUF_HAS_DEATH_TEST
|
|
751
|
+
|
|
752
|
+
// Test overflows.
|
|
753
|
+
EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
|
|
754
|
+
EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
|
|
755
|
+
EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
|
|
756
|
+
EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
|
|
757
|
+
EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
|
|
758
|
+
EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
|
|
759
|
+
EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
|
|
760
|
+
}
|
|
761
|
+
|
|
762
|
+
TEST_F(TokenizerTest, ParseFloat) {
|
|
763
|
+
EXPECT_DOUBLE_EQ(1 , Tokenizer::ParseFloat("1."));
|
|
764
|
+
EXPECT_DOUBLE_EQ(1e3 , Tokenizer::ParseFloat("1e3"));
|
|
765
|
+
EXPECT_DOUBLE_EQ(1e3 , Tokenizer::ParseFloat("1E3"));
|
|
766
|
+
EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
|
|
767
|
+
EXPECT_DOUBLE_EQ(.1 , Tokenizer::ParseFloat(".1"));
|
|
768
|
+
EXPECT_DOUBLE_EQ(.25 , Tokenizer::ParseFloat(".25"));
|
|
769
|
+
EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
|
|
770
|
+
EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
|
|
771
|
+
EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
|
|
772
|
+
EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
|
|
773
|
+
EXPECT_DOUBLE_EQ(5 , Tokenizer::ParseFloat("5"));
|
|
774
|
+
EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
|
|
775
|
+
EXPECT_DOUBLE_EQ(1.2 , Tokenizer::ParseFloat("1.2"));
|
|
776
|
+
EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));
|
|
777
|
+
|
|
778
|
+
// Test invalid integers that may still be tokenized as integers.
|
|
779
|
+
EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
|
|
780
|
+
EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
|
|
781
|
+
EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));
|
|
782
|
+
|
|
783
|
+
// Test 'f' suffix.
|
|
784
|
+
EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
|
|
785
|
+
EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
|
|
786
|
+
EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));
|
|
787
|
+
|
|
788
|
+
// These should parse successfully even though they are out of range.
|
|
789
|
+
// Overflows become infinity and underflows become zero.
|
|
790
|
+
EXPECT_EQ( 0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
|
|
791
|
+
EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));
|
|
792
|
+
|
|
793
|
+
#ifdef PROTOBUF_HAS_DEATH_TEST // death tests do not work on Windows yet
|
|
794
|
+
// Test invalid integers that will never be tokenized as integers.
|
|
795
|
+
EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
|
|
796
|
+
"passed text that could not have been tokenized as a float");
|
|
797
|
+
EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
|
|
798
|
+
"passed text that could not have been tokenized as a float");
|
|
799
|
+
EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
|
|
800
|
+
"passed text that could not have been tokenized as a float");
|
|
801
|
+
#endif // PROTOBUF_HAS_DEATH_TEST
|
|
802
|
+
}
|
|
803
|
+
|
|
804
|
+
TEST_F(TokenizerTest, ParseString) {
|
|
805
|
+
string output;
|
|
806
|
+
Tokenizer::ParseString("'hello'", &output);
|
|
807
|
+
EXPECT_EQ("hello", output);
|
|
808
|
+
Tokenizer::ParseString("\"blah\\nblah2\"", &output);
|
|
809
|
+
EXPECT_EQ("blah\nblah2", output);
|
|
810
|
+
Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
|
|
811
|
+
EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
|
|
812
|
+
Tokenizer::ParseString("'\\x20\\x4'", &output);
|
|
813
|
+
EXPECT_EQ("\x20\x4", output);
|
|
814
|
+
|
|
815
|
+
// Test invalid strings that may still be tokenized as strings.
|
|
816
|
+
Tokenizer::ParseString("\"\\a\\l\\v\\t", &output); // \l is invalid
|
|
817
|
+
EXPECT_EQ("\a?\v\t", output);
|
|
818
|
+
Tokenizer::ParseString("'", &output);
|
|
819
|
+
EXPECT_EQ("", output);
|
|
820
|
+
Tokenizer::ParseString("'\\", &output);
|
|
821
|
+
EXPECT_EQ("\\", output);
|
|
822
|
+
|
|
823
|
+
// Experiment with Unicode escapes. Here are one-, two- and three-byte Unicode
|
|
824
|
+
// characters.
|
|
825
|
+
Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
|
|
826
|
+
EXPECT_EQ("$¢€𤭢XX", output);
|
|
827
|
+
// Same thing encoded using UTF16.
|
|
828
|
+
Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
|
|
829
|
+
EXPECT_EQ("$¢€𤭢XX", output);
|
|
830
|
+
// Here's some broken UTF16; there's a head surrogate with no tail surrogate.
|
|
831
|
+
// We just output this as if it were UTF8; it's not a defined code point, but
|
|
832
|
+
// it has a defined encoding.
|
|
833
|
+
Tokenizer::ParseString("'\\ud852XX'", &output);
|
|
834
|
+
EXPECT_EQ("\xed\xa1\x92XX", output);
|
|
835
|
+
// Malformed escape: Demons may fly out of the nose.
|
|
836
|
+
Tokenizer::ParseString("\\u0", &output);
|
|
837
|
+
EXPECT_EQ("u0", output);
|
|
838
|
+
|
|
839
|
+
// Test invalid strings that will never be tokenized as strings.
|
|
840
|
+
#ifdef PROTOBUF_HAS_DEATH_TEST // death tests do not work on Windows yet
|
|
841
|
+
EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
|
|
842
|
+
"passed text that could not have been tokenized as a string");
|
|
843
|
+
#endif // PROTOBUF_HAS_DEATH_TEST
|
|
844
|
+
}
|
|
845
|
+
|
|
846
|
+
TEST_F(TokenizerTest, ParseStringAppend) {
|
|
847
|
+
// Check that ParseString and ParseStringAppend differ.
|
|
848
|
+
string output("stuff+");
|
|
849
|
+
Tokenizer::ParseStringAppend("'hello'", &output);
|
|
850
|
+
EXPECT_EQ("stuff+hello", output);
|
|
851
|
+
Tokenizer::ParseString("'hello'", &output);
|
|
852
|
+
EXPECT_EQ("hello", output);
|
|
853
|
+
}
|
|
854
|
+
|
|
855
|
+
// -------------------------------------------------------------------
|
|
856
|
+
|
|
857
|
+
// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error.  Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;  // Expected collected error text, one
                       // "line:column: message\n" entry per error
                       // (positions are 0-based, as in the cases below).
};
|
|
867
|
+
|
|
868
|
+
// Lets gtest print a failing ErrorCase by its (escaped) input text.
inline ostream& operator<<(ostream& out, const ErrorCase& test_case) {
  out << CEscape(test_case.input);
  return out;
}
|
|
872
|
+
|
|
873
|
+
// Inputs that must produce tokenizer errors, paired with the exact error
// text expected from the error collector.  Recoverable cases end in "foo"
// so the test can verify that tokenization continues after the error.
ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: Unexpected end of string.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'\\u01' foo", true,
    "0:5: Expected four hex digits for \\u escape sequence.\n" },
  // NOTE(review): exact duplicate of the previous case — presumably meant to
  // cover a different malformed \u input; confirm against upstream intent.
  { "'\\u01' foo", true,
    "0:5: Expected four hex digits for \\u escape sequence.\n" },
  { "'\\uXYZ' foo", true,
    "0:3: Expected four hex digits for \\u escape sequence.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
  { "1e foo", true,
    "0:2: \"e\" must be followed by exponent.\n" },
  { "1e- foo", true,
    "0:3: \"e\" must be followed by exponent.\n" },
  { "1.2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "1e2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "a.1 foo", true,
    "0:1: Need space between identifier and decimal point.\n" },
  // allow_f_after_float not enabled, so this should be an error.
  { "1.0f foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Block comment errors.
  { "/*", false,
    "0:2: End-of-file inside block comment.\n"
    "0:0: Comment started here.\n"},
  { "/*/*/ foo", true,
    "0:3: \"/*\" inside block comment.  Block comments cannot be nested.\n"},

  // Control characters.  Multiple consecutive control characters should only
  // produce one error.
  { "\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },
  { "\b\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check that control characters at end of input don't result in an
  // infinite loop.
  { "\b", false,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check recovery from '\0'.  We have to explicitly specify the length of
  // these strings because otherwise the string constructor will just call
  // strlen() which will see the first '\0' and think that is the end of the
  // string.
  { string("\0foo", 4), true,
    "0:0: Invalid control characters encountered in text.\n" },
  { string("\0\0foo", 5), true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check error from high order bits set
  { "\300foo", true,
    "0:0: Interpreting non ascii codepoint 192.\n" },
};
|
|
953
|
+
|
|
954
|
+
TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Build a tokenizer over the error-case input using the current block size.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Consume every token, tracking only whether the final one was "foo".
  bool saw_foo_last = false;
  while (tokenizer.Next()) {
    saw_foo_last = (tokenizer.current().text == "foo");
  }

  // The accumulated error text must match the expectation exactly.
  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);

  // A recoverable error must leave the tokenizer able to read the trailing
  // "foo" token.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(saw_foo_last);
  }
}
|
|
976
|
+
|
|
977
|
+
// -------------------------------------------------------------------
|
|
978
|
+
|
|
979
|
+
TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  const string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  {
    // Read a single token, then let the tokenizer fall out of scope.  Its
    // destructor is expected to back the input stream up past the unread
    // portion of the buffered data.
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);
    tokenizer.Next();
  }

  // Only "foo" should have been consumed from the stream.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}
|
|
994
|
+
|
|
995
|
+
|
|
996
|
+
} // namespace
|
|
997
|
+
} // namespace io
|
|
998
|
+
} // namespace protobuf
|
|
999
|
+
} // namespace google
|