wiki-api 0.0.2 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/README.md +64 -33
- data/lib/wiki/api/connect.rb +21 -7
- data/lib/wiki/api/page.rb +23 -50
- data/lib/wiki/api/page_block.rb +6 -4
- data/lib/wiki/api/page_headline.rb +97 -2
- data/lib/wiki/api/page_link.rb +9 -4
- data/lib/wiki/api/page_list_item.rb +4 -2
- data/lib/wiki/api/util.rb +12 -1
- data/lib/wiki/api/version.rb +1 -1
- data/test/unit/files/Wiktionary_program.html +4232 -0
- data/test/unit/wiki_page_offline.rb +262 -0
- data/wiki-api.gemspec +2 -2
- metadata +8 -8
- data/test/unit/wiki_page_config.rb +0 -45
- data/test/unit/wiki_page_object.rb +0 -229
    
        checksums.yaml
    CHANGED
    
    | @@ -1,15 +1,15 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 3 | 
             
              metadata.gz: !binary |-
         | 
| 4 | 
            -
                 | 
| 4 | 
            +
                NjQ3MjZkMDdmNTg2YjdhZDRmM2E3MjU4ZjA1Y2IwOGYzODEwZTFkMA==
         | 
| 5 5 | 
             
              data.tar.gz: !binary |-
         | 
| 6 | 
            -
                 | 
| 6 | 
            +
                YWE4Mzc4ZjRlYTBjNGE4MTkyYmE0OGFkOTJkMDViZTI0MjQ5MGFiMw==
         | 
| 7 7 | 
             
            SHA512:
         | 
| 8 8 | 
             
              metadata.gz: !binary |-
         | 
| 9 | 
            -
                 | 
| 10 | 
            -
                 | 
| 11 | 
            -
                 | 
| 9 | 
            +
                OTNhMTZkNjMwNzJiMzU5YWE0ZDZiNzRlZWU5ZDJjM2Q1NTA5ZWRiN2IzY2Mw
         | 
| 10 | 
            +
                MmU1ZDk0ODZhN2U4ODYwNjY0ZjdmY2U5ZTFkMDk4ZDA2MzIyODUzNjE0YzVl
         | 
| 11 | 
            +
                OGE2ZmFmOTYyOWY2MWIyNGNlNmU5NjYwOTNkMGNhNjllOWM0YzQ=
         | 
| 12 12 | 
             
              data.tar.gz: !binary |-
         | 
| 13 | 
            -
                 | 
| 14 | 
            -
                 | 
| 15 | 
            -
                 | 
| 13 | 
            +
                YjgzZGEzYzhhOWFmNzZhMjRlMWFiYmJiY2Q3N2EwOGQwZTBjY2Q0NzYxNWE2
         | 
| 14 | 
            +
                ODc5NmMyNmYyODMyNmVmMjFmYzhhOTAzMTUzZTBmODU2OTMwY2RhYjg0Mjkz
         | 
| 15 | 
            +
                Yjk3NjMzNGFlZGViYzQyOGQ5YzVjM2MzMjIyNWVlOWRhOTU0MDk=
         | 
    
        data/README.md
    CHANGED
    
    | @@ -1,13 +1,17 @@ | |
| 1 1 | 
             
            # Wiki::Api
         | 
| 2 2 |  | 
| 3 | 
            -
            Wiki API is a gem (Ruby on Rails) that interfaces with the MediaWiki API (https://www.mediawiki.org/wiki/API:Main_page). This gem is more than a interface, it has abstract classes  | 
| 3 | 
            +
            Wiki API is a gem (Ruby on Rails) that interfaces with the MediaWiki API (https://www.mediawiki.org/wiki/API:Main_page). This gem is more than an interface: it has abstract classes for Page and Headline parsing. You're able to iterate through these headlines, and access data accordingly. 
         | 
| 4 4 |  | 
| 5 | 
            -
            NOTE: nokogiri  | 
| 5 | 
            +
            NOTE: This gem has a nokogiri (http://nokogiri.org/Nokogiri.html) backend (for HTML parsing). Major components: Page, Headline, Block, ListItem, and Link are wrappers for easy data access, however it's still possible to retrieve the raw HTML within these objects.
         | 
| 6 6 |  | 
| 7 7 | 
             
            Requests to the MediaWiki API use the following URI structure:
         | 
| 8 8 |  | 
| 9 9 | 
             
                http(s)://somemediawiki.org/w/api.php?action=parse&format=json&page="anypage"
         | 
| 10 10 |  | 
| 11 | 
            +
            # RDoc (rdoc.info)
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                http://rdoc.info/github/dblommesteijn/wiki-api/frames/file/README.md
         | 
| 14 | 
            +
             | 
| 11 15 |  | 
| 12 16 | 
             
            ### Dependencies (production)
         | 
| 13 17 |  | 
| @@ -15,27 +19,27 @@ Requests to the MediaWiki API use the following URI structure: | |
| 15 19 | 
             
            * nokogiri
         | 
| 16 20 |  | 
| 17 21 |  | 
| 18 | 
            -
            ### Roadmap
         | 
| 22 | 
            +
            ### Feature Roadmap
         | 
| 19 23 |  | 
| 20 | 
            -
            * Version (0.0 | 
| 24 | 
            +
            * Version (0.1.0)
         | 
| 21 25 |  | 
| 22 | 
            -
               | 
| 26 | 
            +
              Major current release with several core changes.
         | 
| 23 27 |  | 
| 24 | 
            -
             | 
| 28 | 
            +
            * Version (0.1.1)
         | 
| 29 | 
            +
              
         | 
| 30 | 
            +
              No features determined yet (please drop me a line if you're interested in additions).
         | 
| 25 31 |  | 
| 26 32 |  | 
| 27 33 | 
             
            ### Changelog
         | 
| 28 34 |  | 
| 29 | 
            -
            * Version (0.0. | 
| 30 | 
            -
              
         | 
| 31 | 
            -
              Nested ListItems, Links (within Page)
         | 
| 35 | 
            +
            * Version (0.0.2) -> (current)
         | 
| 32 36 |  | 
| 33 | 
            -
               | 
| 37 | 
            +
              PageLink URI without global config Exception resolved
         | 
| 34 38 |  | 
| 39 | 
            +
              Reverse (parent) object lookup
         | 
| 35 40 |  | 
| 36 | 
            -
             | 
| 41 | 
            +
              Nested PageHeadline objects
         | 
| 37 42 |  | 
| 38 | 
            -
            None discovered thus far.
         | 
| 39 43 |  | 
| 40 44 |  | 
| 41 45 | 
             
            ## Installation
         | 
| @@ -71,13 +75,16 @@ Wiki::Api::Connect.config = CONFIG | |
| 71 75 |  | 
| 72 76 | 
             
            ## Usage
         | 
| 73 77 |  | 
| 74 | 
            -
            ### Query a Page
         | 
| 78 | 
            +
            ### Query a Page and Headline
         | 
| 75 79 |  | 
| 76 80 | 
             
            Requesting headlines from a given page.
         | 
| 77 81 |  | 
| 78 82 | 
             
            ```ruby
         | 
| 79 83 | 
             
            page = Wiki::Api::Page.new name: "Wiktionary:Welcome,_newcomers"
         | 
| 80 | 
            -
             | 
| 84 | 
            +
            # the root headline equals the pagename
         | 
| 85 | 
            +
            puts page.root_headline.name
         | 
| 86 | 
            +
            # iterate next level of headlines
         | 
| 87 | 
            +
            page.root_headline.headlines.each do |headline_name, headline|
         | 
| 81 88 | 
             
              # printing headline name (PageHeadline)
         | 
| 82 89 | 
             
              puts headline.name
         | 
| 83 90 | 
             
            end
         | 
| @@ -87,29 +94,28 @@ Getting headlines for a given name. | |
| 87 94 |  | 
| 88 95 | 
             
            ```ruby
         | 
| 89 96 | 
             
            page = Wiki::Api::Page.new name: "Wiktionary:Welcome,_newcomers"
         | 
| 90 | 
            -
             | 
| 91 | 
            -
             | 
| 92 | 
            -
             | 
| 93 | 
            -
             | 
| 97 | 
            +
            # lookup headline by name (underscore and case are ignored)
         | 
| 98 | 
            +
            headline = page.root_headline.headline("editing wiktionary").first
         | 
| 99 | 
            +
            # printing headline name (PageHeadline)
         | 
| 100 | 
            +
            puts headline.name
         | 
| 101 | 
            +
            # get the type of nested headline (html h1,2,3,4 etc.)
         | 
| 102 | 
            +
            puts headline.type
         | 
| 94 103 | 
             
            ```
         | 
| 95 104 |  | 
| 96 105 | 
             
            ### Basic Page structure
         | 
| 97 106 |  | 
| 98 107 | 
             
            ```ruby
         | 
| 99 108 | 
             
            page = Wiki::Api::Page.new name: "Wiktionary:Welcome,_newcomers"
         | 
| 100 | 
            -
             | 
| 101 109 | 
             
            # iterate PageHeadline objects
         | 
| 102 | 
            -
            page.headlines.each do |headline|
         | 
| 103 | 
            -
             | 
| 110 | 
            +
            page.root_headline.headlines.each do |headline_name, headline|
         | 
| 104 111 | 
             
              # exposing nokogiri internal elements
         | 
| 105 112 | 
             
              elements = headline.elements.flatten
         | 
| 106 113 | 
             
              elements.each do |element|
         | 
| 107 | 
            -
                #  | 
| 114 | 
            +
                # print will result in: Nokogiri::XML::Text or Nokogiri::XML::Element
         | 
| 115 | 
            +
                puts element.class
         | 
| 108 116 | 
             
              end
         | 
| 109 | 
            -
             | 
| 110 117 | 
             
              # string representation of all nested text
         | 
| 111 118 | 
             
              block.to_texts
         | 
| 112 | 
            -
             | 
| 113 119 | 
             
              # iterate PageListItem objects
         | 
| 114 120 | 
             
              block.list_items.each do |list_item|
         | 
| 115 121 | 
             
                # string representation of nested text
         | 
| @@ -136,7 +142,7 @@ end | |
| 136 142 | 
             
            ```
         | 
| 137 143 |  | 
| 138 144 |  | 
| 139 | 
            -
            ### Example using Global config (https://en.wikipedia.org/wiki/ | 
| 145 | 
            +
            ### Example using Global config (https://en.wikipedia.org/wiki/Ruby_on_Rails)
         | 
| 140 146 |  | 
| 141 147 | 
             
            This is an example of querying wikipedia.org on the page: "Ruby_on_Rails", and printing the References headline links for each list item.
         | 
| 142 148 |  | 
| @@ -146,35 +152,32 @@ CONFIG = { uri: "https://en.wikipedia.org" } | |
| 146 152 | 
             
            Wiki::Api::Connect.config = CONFIG
         | 
| 147 153 |  | 
| 148 154 | 
             
            # querying the page
         | 
| 149 | 
            -
            page = Wiki::Api::Page.new name: " | 
| 155 | 
            +
            page = Wiki::Api::Page.new name: "Ruby_on_Rails"
         | 
| 150 156 |  | 
| 151 157 | 
             
            # get headlines with name Reference (there can be multiple headlines with the same name!)
         | 
| 152 | 
            -
            headlines = page.headline "References"
         | 
| 158 | 
            +
            headlines = page.root_headline.headline "References"
         | 
| 153 159 |  | 
| 154 160 | 
             
            # iterate headlines
         | 
| 155 161 | 
             
            headlines.each do |headline|
         | 
| 156 162 | 
             
              # iterate list items on the given headline
         | 
| 157 163 | 
             
              headline.block.list_items.each do |list_item|
         | 
| 158 | 
            -
             | 
| 159 164 | 
             
                # print the uri of all links
         | 
| 160 165 | 
             
                puts list_item.links.map{ |l| l.uri }
         | 
| 161 | 
            -
                
         | 
| 162 166 | 
             
              end
         | 
| 163 167 | 
             
            end
         | 
| 164 168 | 
             
            ```
         | 
| 165 169 |  | 
| 166 170 |  | 
| 167 | 
            -
             | 
| 168 | 
            -
            ### Example passing URI (https://en.wikipedia.org/wiki/Ruby_on_rails)
         | 
| 171 | 
            +
            ### Example passing URI (https://en.wikipedia.org/wiki/Ruby_on_Rails)
         | 
| 169 172 |  | 
| 170 173 | 
             
            This is the same example as the one above, except that the URI is passed directly to the page instead of being set through a global config.
         | 
| 171 174 |  | 
| 172 175 | 
             
            ```ruby
         | 
| 173 176 | 
             
            # querying the page
         | 
| 174 | 
            -
            page = Wiki::Api::Page.new name: " | 
| 177 | 
            +
            page = Wiki::Api::Page.new name: "Ruby_on_Rails", uri: "https://en.wikipedia.org"
         | 
| 175 178 |  | 
| 176 179 | 
             
            # get headlines with name Reference (there can be multiple headlines with the same name!)
         | 
| 177 | 
            -
            headlines = page.headline "References"
         | 
| 180 | 
            +
            headlines = page.root_headline.headline "References"
         | 
| 178 181 |  | 
| 179 182 | 
             
            # iterate headlines
         | 
| 180 183 | 
             
            headlines.each do |headline|
         | 
| @@ -189,4 +192,32 @@ end | |
| 189 192 | 
             
            ```
         | 
| 190 193 |  | 
| 191 194 |  | 
| 195 | 
            +
            ### Example searching headlines
         | 
| 196 | 
            +
             | 
| 197 | 
            +
            This example shows how the headlines can be searched. For more info check: https://github.com/dblommesteijn/wiki-api/blob/master/lib/wiki/api/page.rb#L97
         | 
| 198 | 
            +
             | 
| 199 | 
            +
             | 
| 200 | 
            +
            ```ruby
         | 
| 201 | 
            +
            # querying the page
         | 
| 202 | 
            +
            page = Wiki::Api::Page.new name: "Ruby_on_Rails", uri: "https://en.wikipedia.org"
         | 
| 203 | 
            +
             | 
| 204 | 
            +
            # NOTE: the following are all valid headline names:
         | 
| 205 | 
            +
            # request headline (by literal name)
         | 
| 206 | 
            +
            headlines = page.root_headline.headline "Philosophy_and_design"
         | 
| 207 | 
            +
            puts headlines.map{|h| h.name}
         | 
| 208 | 
            +
            # request headline (by downcase name)
         | 
| 209 | 
            +
            headlines = page.root_headline.headline "philosophy_and_design"
         | 
| 210 | 
            +
            puts headlines.map{|h| h.name}
         | 
| 211 | 
            +
            # request headline (by human name)
         | 
| 212 | 
            +
            headlines = page.root_headline.headline "philosophy and design"
         | 
| 213 | 
            +
            puts headlines.map{|h| h.name}
         | 
| 214 | 
            +
             | 
| 215 | 
            +
            # NOTE2: headlines are matched on headline.start_with?(requested_headline)
         | 
| 216 | 
            +
            # because of start_with? compare this should work as well!
         | 
| 217 | 
            +
            headlines = page.root_headline.headline "philosophy"
         | 
| 218 | 
            +
            puts headlines.map{|h| h.name}
         | 
| 219 | 
            +
            ```
         | 
| 220 | 
            +
             | 
| 221 | 
            +
             | 
| 222 | 
            +
             | 
| 192 223 |  | 
    
        data/lib/wiki/api/connect.rb
    CHANGED
    
    | @@ -7,12 +7,13 @@ module Wiki | |
| 7 7 |  | 
| 8 8 | 
             
                class Connect
         | 
| 9 9 |  | 
| 10 | 
            -
                  attr_accessor :uri, :api_path, :api_options, :http, :request, :response, :html, :parsed
         | 
| 10 | 
            +
                  attr_accessor :uri, :api_path, :api_options, :http, :request, :response, :html, :parsed, :file
         | 
| 11 11 |  | 
| 12 12 | 
             
                  def initialize(options={})
         | 
| 13 13 | 
             
                    @@config ||= nil
         | 
| 14 14 | 
             
                    options.merge! @@config unless @@config.nil?
         | 
| 15 15 | 
             
                    self.uri = options[:uri] if options.include? :uri
         | 
| 16 | 
            +
                    self.file = options[:file] if options.include? :file
         | 
| 16 17 | 
             
                    self.api_path = options[:api_path] if options.include? :api_path
         | 
| 17 18 | 
             
                    self.api_options = options[:api_options] if options.include? :api_options
         | 
| 18 19 |  | 
| @@ -38,12 +39,25 @@ module Wiki | |
| 38 39 |  | 
| 39 40 | 
             
                  def page page_name
         | 
| 40 41 | 
             
                    self.api_options[:page] = page_name
         | 
| 41 | 
            -
                     | 
| 42 | 
            -
                     | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 45 | 
            -
             | 
| 46 | 
            -
             | 
| 42 | 
            +
                    # parse page by uri
         | 
| 43 | 
            +
                    if !self.uri.nil? && self.file.nil?
         | 
| 44 | 
            +
                      self.connect
         | 
| 45 | 
            +
                      response = self.response
         | 
| 46 | 
            +
                      json = JSON.parse response.body, {symbolize_names: true}
         | 
| 47 | 
            +
                      raise json[:error][:code] unless valid? json, response
         | 
| 48 | 
            +
                      self.html = json[:parse][:text]
         | 
| 49 | 
            +
                      self.parsed = Nokogiri::HTML self.html[:*]
         | 
| 50 | 
            +
                    # parse page by file
         | 
| 51 | 
            +
                    elsif !self.file.nil?
         | 
| 52 | 
            +
                      f = File.open(self.file)
         | 
| 53 | 
            +
                      # self.parsed = Nokogiri::HTML self.html[:*]
         | 
| 54 | 
            +
                      self.parsed = Nokogiri::HTML(f)
         | 
| 55 | 
            +
                      f.close
         | 
| 56 | 
            +
                    # invalid config, raise exception
         | 
| 57 | 
            +
                    else
         | 
| 58 | 
            +
                      raise "no :uri or :file config found!"
         | 
| 59 | 
            +
                    end
         | 
| 60 | 
            +
                    self.parsed
         | 
| 47 61 | 
             
                  end
         | 
| 48 62 |  | 
| 49 63 | 
             
                  class << self
         | 
    
        data/lib/wiki/api/page.rb
    CHANGED
    
    | @@ -1,49 +1,34 @@ | |
| 1 1 | 
             
            module Wiki
         | 
| 2 2 | 
             
              module Api
         | 
| 3 3 |  | 
| 4 | 
            +
                # MediaWiki Page, collection of all html information plus its page title
         | 
| 4 5 | 
             
                class Page
         | 
| 5 6 |  | 
| 6 | 
            -
                  attr_accessor :name, :parsed_page, :uri
         | 
| 7 | 
            +
                  attr_accessor :name, :parsed_page, :uri, :parent
         | 
| 7 8 |  | 
| 8 9 | 
             
                  def initialize(options={})
         | 
| 9 10 | 
             
                    self.name = options[:name] if options.include? :name
         | 
| 10 | 
            -
                    uri = options[:uri] if options.include? :uri
         | 
| 11 | 
            -
             | 
| 12 | 
            -
                    @@config ||= nil
         | 
| 13 | 
            -
                    if @@config.nil? || !uri.nil?
         | 
| 14 | 
            -
                      # use the connection to collect HTML pages for parsing
         | 
| 15 | 
            -
                      @connect = Wiki::Api::Connect.new uri: uri
         | 
| 16 | 
            -
                    else
         | 
| 17 | 
            -
                      # using a local HTML file for parsing
         | 
| 18 | 
            -
                    end
         | 
| 11 | 
            +
                    self.uri = options[:uri] if options.include? :uri
         | 
| 12 | 
            +
                    @connect = Wiki::Api::Connect.new uri: uri
         | 
| 19 13 | 
             
                  end
         | 
| 20 14 |  | 
| 21 | 
            -
                  def  | 
| 22 | 
            -
                     | 
| 23 | 
            -
                    self.parse_blocks.each do |headline_name, elements|
         | 
| 24 | 
            -
                      headline = PageHeadline.new name: headline_name
         | 
| 25 | 
            -
                      elements.each do |element|
         | 
| 26 | 
            -
                        # nokogiri element
         | 
| 27 | 
            -
                        headline.block << element
         | 
| 28 | 
            -
                      end
         | 
| 29 | 
            -
                      headlines << headline
         | 
| 30 | 
            -
                    end
         | 
| 31 | 
            -
                    headlines
         | 
| 15 | 
            +
                  def connect
         | 
| 16 | 
            +
                    @connect
         | 
| 32 17 | 
             
                  end
         | 
| 33 18 |  | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 37 | 
            -
             | 
| 38 | 
            -
                      elements.each do |element|
         | 
| 39 | 
            -
                        # nokogiri element
         | 
| 40 | 
            -
                        headline.block << element
         | 
| 41 | 
            -
                      end
         | 
| 42 | 
            -
                      headlines << headline
         | 
| 43 | 
            -
                    end
         | 
| 44 | 
            -
                    headlines
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                  # collect all headlines, keep original page formatting
         | 
| 21 | 
            +
                  def root_headline
         | 
| 22 | 
            +
                    self.parse_blocks
         | 
| 45 23 | 
             
                  end
         | 
| 46 24 |  | 
| 25 | 
            +
                  # # collect headlines by given name, this will flatten the nested headlines
         | 
| 26 | 
            +
                  # def flat_headlines_by_name headline_name
         | 
| 27 | 
            +
                  #   raise "not yet implemented!"
         | 
| 28 | 
            +
                  #   # TODO: implement flattening of headlines within the root headline
         | 
| 29 | 
            +
                  #   # ALT:  breath search option in the root of the first headline
         | 
| 30 | 
            +
                  #   self.parse_blocks(headline_name)
         | 
| 31 | 
            +
                  # end
         | 
| 47 32 |  | 
| 48 33 |  | 
| 49 34 | 
             
                  def to_html
         | 
| @@ -55,22 +40,8 @@ module Wiki | |
| 55 40 | 
             
                    self.parse_page = nil
         | 
| 56 41 | 
             
                  end
         | 
| 57 42 |  | 
| 58 | 
            -
                  class << self
         | 
| 59 | 
            -
                    def config=(config = {})
         | 
| 60 | 
            -
                      @@config = config
         | 
| 61 | 
            -
                    end
         | 
| 62 | 
            -
                  end
         | 
| 63 | 
            -
             | 
| 64 | 
            -
                  protected
         | 
| 65 | 
            -
             | 
| 66 43 | 
             
                  def load_page!
         | 
| 67 | 
            -
                     | 
| 68 | 
            -
                      self.parsed_page ||= @connect.page self.name
         | 
| 69 | 
            -
                    elsif self.parsed_page.nil?
         | 
| 70 | 
            -
                      f = File.open(@@config[:file])
         | 
| 71 | 
            -
                      self.parsed_page = Nokogiri::HTML(f)
         | 
| 72 | 
            -
                      f.close
         | 
| 73 | 
            -
                    end
         | 
| 44 | 
            +
                    self.parsed_page ||= @connect.page self.name
         | 
| 74 45 | 
             
                  end
         | 
| 75 46 |  | 
| 76 47 |  | 
| @@ -81,11 +52,12 @@ module Wiki | |
| 81 52 |  | 
| 82 53 | 
             
                    # get headline nodes by span class
         | 
| 83 54 | 
             
                    xs = self.parsed_page.xpath("//span[@class='mw-headline']")
         | 
| 55 | 
            +
             | 
| 84 56 | 
             
                    # filter single headline by name (ignore case)
         | 
| 85 57 | 
             
                    xs = self.filter_headline xs, headline_name unless headline_name.nil?
         | 
| 86 58 |  | 
| 87 59 | 
             
                    # NOTE: first_part has no id attribute and thus cannot be filtered or processed within xpath (xs)
         | 
| 88 | 
            -
                    if headline_name | 
| 60 | 
            +
                    if headline_name.nil? || headline_name.start_with?(self.name.downcase)
         | 
| 89 61 | 
             
                      x = self.first_part
         | 
| 90 62 | 
             
                      result[self.name] ||= [] 
         | 
| 91 63 | 
             
                      result[self.name] << (self.collect_elements(x.parent))
         | 
| @@ -95,11 +67,12 @@ module Wiki | |
| 95 67 | 
             
                    xs.each do |x|
         | 
| 96 68 | 
             
                      headline = x.attributes["id"].value
         | 
| 97 69 | 
             
                      elements = self.collect_elements x.parent.next
         | 
| 98 | 
            -
                      result[headline] ||= [] | 
| 70 | 
            +
                      result[headline] ||= []
         | 
| 99 71 | 
             
                      result[headline] << elements
         | 
| 100 72 | 
             
                    end
         | 
| 101 73 |  | 
| 102 | 
            -
                     | 
| 74 | 
            +
                    # create root object
         | 
| 75 | 
            +
                    PageHeadline.new parent: self, name: result.first[0], headlines: result, level: 0
         | 
| 103 76 | 
             
                  end
         | 
| 104 77 |  | 
| 105 78 | 
             
                  # harvest first part of the page (missing heading and class="mw-headline")
         | 
    
        data/lib/wiki/api/page_block.rb
    CHANGED
    
    | @@ -1,20 +1,22 @@ | |
| 1 1 | 
             
            module Wiki
         | 
| 2 2 | 
             
              module Api
         | 
| 3 3 |  | 
| 4 | 
            +
                # Collection of elements, segmented per headline
         | 
| 4 5 | 
             
                class PageBlock
         | 
| 5 6 |  | 
| 6 | 
            -
                  attr_accessor :elements
         | 
| 7 | 
            +
                  attr_accessor :elements, :parent
         | 
| 7 8 |  | 
| 8 9 | 
             
                  def initialize options={}
         | 
| 10 | 
            +
                    self.parent = options[:parent] if options.include? :parent
         | 
| 9 11 | 
             
                    self.elements = []
         | 
| 10 12 | 
             
                  end
         | 
| 11 13 |  | 
| 12 14 | 
             
                  def << value
         | 
| 15 | 
            +
                    # value.first.previous.name
         | 
| 13 16 | 
             
                    self.elements << value
         | 
| 14 17 | 
             
                  end
         | 
| 15 18 |  | 
| 16 19 | 
             
                  def to_texts
         | 
| 17 | 
            -
                    # TODO: perhaps we should wrap the elements with objects??
         | 
| 18 20 | 
             
                    texts = []
         | 
| 19 21 | 
             
                    self.elements.flatten.each do |element|
         | 
| 20 22 | 
             
                      text = Wiki::Api::Util.element_to_text element if element.is_a? Nokogiri::XML::Element
         | 
| @@ -28,14 +30,14 @@ module Wiki | |
| 28 30 | 
             
                  def list_items
         | 
| 29 31 | 
             
                    # TODO: perhaps we should wrap the elements with objects, and request a li per element??
         | 
| 30 32 | 
             
                    self.search("li").map do |list_item|
         | 
| 31 | 
            -
                      PageListItem.new element: list_item
         | 
| 33 | 
            +
                      PageListItem.new parent: self, element: list_item
         | 
| 32 34 | 
             
                    end
         | 
| 33 35 | 
             
                  end
         | 
| 34 36 |  | 
| 35 37 | 
             
                  def links
         | 
| 36 38 | 
             
                    # TODO: perhaps we should wrap the elements with objects, and request a li per element??
         | 
| 37 39 | 
             
                    self.search("a").map do |a|
         | 
| 38 | 
            -
                      PageLink.new element: a
         | 
| 40 | 
            +
                      PageLink.new parent: self, element: a
         | 
| 39 41 | 
             
                    end
         | 
| 40 42 | 
             
                  end
         | 
| 41 43 |  | 
| @@ -1,20 +1,115 @@ | |
| 1 1 | 
             
            module Wiki
         | 
| 2 2 | 
             
              module Api
         | 
| 3 3 |  | 
| 4 | 
            +
                # Headline for a page (class="mw-headline")
         | 
| 4 5 | 
             
                class PageHeadline
         | 
| 5 6 |  | 
| 6 | 
            -
                   | 
| 7 | 
            +
                  require 'json'
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                  LEVEL = ["text", "h1", "h2", "h3", "h4", "h5", "h6"]
         | 
| 10 | 
            +
             | 
| 11 | 
            +
                  attr_accessor :name, :block, :parent, :headlines, :level
         | 
| 7 12 |  | 
| 8 13 | 
             
                  def initialize options={}
         | 
| 9 14 | 
             
                    self.name = options[:name] if options.include? :name
         | 
| 10 | 
            -
                    self. | 
| 15 | 
            +
                    self.parent = options[:parent] if options.include? :parent
         | 
| 16 | 
            +
                    self.level = options[:level] if options.include? :level
         | 
| 17 | 
            +
                    options[:headlines] ||= []
         | 
| 18 | 
            +
                    self.headlines ||= {}
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                    # store elements in a block
         | 
| 21 | 
            +
                    self.block = PageBlock.new parent: self
         | 
| 22 | 
            +
                    if options[:headlines].include? self.name
         | 
| 23 | 
            +
                      options[:headlines][self.name].each do |element|
         | 
| 24 | 
            +
                        self.block << element
         | 
| 25 | 
            +
                      end
         | 
| 26 | 
            +
                    end
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                    # collect nested headlines
         | 
| 29 | 
            +
                    headlines = options[:headlines]
         | 
| 30 | 
            +
                    # remove self from list
         | 
| 31 | 
            +
                    headlines.delete self.name
         | 
| 32 | 
            +
                    nested_headlines = self.nested_headlines headlines, self.name, self.level
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                    # iterate over the nested headlines and build each one recursively
         | 
| 35 | 
            +
                    nested_headlines.each do |headline_name, value|
         | 
| 36 | 
            +
                      level = LEVEL.index value.first.first.previous.name
         | 
| 37 | 
            +
                      self.headlines[headline_name] = (PageHeadline.new parent: self, name: headline_name, headlines: headlines, level: level)
         | 
| 38 | 
            +
                    end
         | 
| 11 39 | 
             
                  end
         | 
| 12 40 |  | 
| 13 41 | 
             
                  def elements
         | 
| 14 42 | 
             
                    self.block.elements
         | 
| 15 43 | 
             
                  end
         | 
| 16 44 |  | 
| 45 | 
            +
                  def type
         | 
| 46 | 
            +
                    self.block.elements.first.first.previous.name
         | 
| 47 | 
            +
                  end
         | 
| 48 | 
            +
             | 
| 49 | 
            +
                  # get the direct child headlines whose name starts with the given name (case-insensitive, spaces treated as underscores)
         | 
| 50 | 
            +
                  def headline name
         | 
| 51 | 
            +
                    name = name.downcase.gsub(" ", "_")
         | 
| 52 | 
            +
                    self.headlines.reject do |k,v| 
         | 
| 53 | 
            +
                      !k.downcase.start_with?(name)
         | 
| 54 | 
            +
                    end.values()
         | 
| 55 | 
            +
                  end
         | 
| 56 | 
            +
             | 
| 57 | 
            +
                  # recursive headline search
         | 
| 58 | 
            +
                  # def headline_by_name name, depth = 1
         | 
| 59 | 
            +
                  #   name = name.downcase.gsub(" ", "_")
         | 
| 60 | 
            +
                  #   ret = []
         | 
| 61 | 
            +
                  #   self.headlines.each do |k,v|
         | 
| 62 | 
            +
                  #     ret << v if k.downcase.start_with?(name)
         | 
| 63 | 
            +
                  #     next if v.headlines.empty?
         | 
| 64 | 
            +
                  #     if depth > 0
         | 
| 65 | 
            +
                  #       q = v.headline_by_name name, (depth - 1)
         | 
| 66 | 
            +
                  #       ret.concat q
         | 
| 67 | 
            +
                  #     end
         | 
| 68 | 
            +
                  #   end
         | 
| 69 | 
            +
                  #   ret
         | 
| 70 | 
            +
                  # end
         | 
| 17 71 |  | 
| 72 | 
            +
                  # check whether a direct child headline whose name starts with the given name exists
         | 
| 73 | 
            +
                  def has_headline? name
         | 
| 74 | 
            +
                    name = name.downcase.gsub(" ", "_")
         | 
| 75 | 
            +
                    self.headlines.each do |k,v|
         | 
| 76 | 
            +
                      return true if k.downcase.start_with?(name)
         | 
| 77 | 
            +
                    end
         | 
| 78 | 
            +
                    false
         | 
| 79 | 
            +
                  end
         | 
| 80 | 
            +
             | 
| 81 | 
            +
                  def to_hash
         | 
| 82 | 
            +
                    ret = {name: self.name, headlines: [], type: self.type}
         | 
| 83 | 
            +
                    self.headlines.each do |headline_name, headline|
         | 
| 84 | 
            +
                      ret[:headlines] << headline.to_hash
         | 
| 85 | 
            +
                    end
         | 
| 86 | 
            +
                    ret
         | 
| 87 | 
            +
                  end
         | 
| 88 | 
            +
             | 
| 89 | 
            +
                  def to_pretty_json
         | 
| 90 | 
            +
                    JSON.pretty_generate self.to_hash
         | 
| 91 | 
            +
                  end
         | 
| 92 | 
            +
             | 
| 93 | 
            +
                  protected 
         | 
| 94 | 
            +
             | 
| 95 | 
            +
                  # filter nested headlines (elements) from a parent headline (by name)
         | 
| 96 | 
            +
                  def nested_headlines headlines, name, original_level
         | 
| 97 | 
            +
                    ret = {}
         | 
| 98 | 
            +
                    init_level = nil
         | 
| 99 | 
            +
                    # iterate headlines, skip already done ones
         | 
| 100 | 
            +
                    #headlines.drop(headline_index + 1).each do |headline|
         | 
| 101 | 
            +
                    headlines.to_a.each do |name, value|
         | 
| 102 | 
            +
                      level = LEVEL.index value.first.first.previous.name
         | 
| 103 | 
            +
                      init_level ||= level          
         | 
| 104 | 
            +
                      # a lower (or equal) level indicates the end of the nest
         | 
| 105 | 
            +
                      break if level <= original_level
         | 
| 106 | 
            +
                      break if level < init_level
         | 
| 107 | 
            +
                      # a higher level indicates nested items; these will be processed recursively
         | 
| 108 | 
            +
                      next if init_level != level
         | 
| 109 | 
            +
                      ret[name] = value
         | 
| 110 | 
            +
                    end
         | 
| 111 | 
            +
                    ret
         | 
| 112 | 
            +
                  end
         | 
| 18 113 |  | 
| 19 114 | 
             
                end
         | 
| 20 115 |  |