embulk-filter-crawler 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -5
- data/build.gradle +1 -1
- data/src/main/java/org/embulk/filter/crawler/CrawlerFilterPlugin.java +14 -0
- metadata +3 -3
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 30a79fc0749463b0dce27c96cb1e32f6cdced9f2
         | 
| 4 | 
            +
              data.tar.gz: 80db4bafe0d5b0418aa8cf18be52c8cfe66f2e95
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 7c7df4cb9d0e103a8dc17e34ee90be45da7c659f5a0a0fc06d28215854dc12b026de9e4eaf4b65354c3befcc191c0a60efc33bd3c0c39ba4ac77eb3a0d327760
         | 
| 7 | 
            +
              data.tar.gz: fc890159a20819a203eae0ae61eb4cce3a9b8bc11d2e8903a61bd5a59ba1ccf451dd8f58e5e918eeb0d1f0f07fcff2138491c343c1eb7a1b52e01e219816614a
         | 
    
        data/README.md
    CHANGED
    
    | @@ -10,13 +10,14 @@ Write short description here and build.gradle file. | |
| 10 10 |  | 
| 11 11 | 
             
            - **target_key**: base_url column key name (string, require)
         | 
| 12 12 | 
             
            - **max_depth_of_crawling**: max depth of crawling (integer, default: unlimited)
         | 
| 13 | 
            -
            - ** | 
| 14 | 
            -
            - ** | 
| 15 | 
            -
            - **max_pages_to_fetch**: max_pages_to_fetch (string, default: unlimited)
         | 
| 13 | 
            +
            - **number_of_crawlers**: parallelism (integer, default: 1)
         | 
| 14 | 
            +
            - **max_pages_to_fetch**: max_pages_to_fetch (integer, default: unlimited)
         | 
| 16 15 | 
             
            - **crawl_storage_folder**: crawl_storage_folder (string, require)
         | 
| 17 16 | 
             
            - **politeness_delay**: politeness_delay (integer, default: null)
         | 
| 18 17 | 
             
            - **user_agent_string**: user_agent_string (string, default: null)
         | 
| 19 | 
            -
            - ** | 
| 18 | 
            +
            - **output_prefix**: output_prefix (string, default: "")
         | 
| 19 | 
            +
            - **connection_timeout**: connection timeout millisecond (integer, default: 30000)
         | 
| 20 | 
            +
            - **socket_timeout**: socket timeout millisecond (integer, default: 20000)
         | 
| 20 21 |  | 
| 21 22 | 
             
            ## Example
         | 
| 22 23 |  | 
| @@ -33,7 +34,6 @@ filters: | |
| 33 34 | 
             
              - type: crawler
         | 
| 34 35 | 
             
                target_key: url
         | 
| 35 36 | 
             
                number_of_crawlers: 10
         | 
| 36 | 
            -
                seed_size: 100
         | 
| 37 37 | 
             
                max_depth_of_crawling: 4
         | 
| 38 38 | 
             
                politeness_delay: 100
         | 
| 39 39 | 
             
                crawl_storage_folder: "/tmp/crawl/%s"
         | 
    
        data/build.gradle
    CHANGED
    
    
        data/src/main/java/org/embulk/filter/crawler/CrawlerFilterPlugin.java
    CHANGED
    
    | @@ -74,6 +74,14 @@ public class CrawlerFilterPlugin | |
| 74 74 | 
             
                    @Config("should_not_visit_pattern")
         | 
| 75 75 | 
             
                    @ConfigDefault("null")
         | 
| 76 76 | 
             
                    public Optional<String> getShouldNotVisitPattern();
         | 
| 77 | 
            +
             | 
| 78 | 
            +
                    @Config("connection_timeout")
         | 
| 79 | 
            +
                    @ConfigDefault("null")
         | 
| 80 | 
            +
                    public Optional<Integer> getConnectionTimeout();
         | 
| 81 | 
            +
             | 
| 82 | 
            +
                    @Config("socket_timeout")
         | 
| 83 | 
            +
                    @ConfigDefault("null")
         | 
| 84 | 
            +
                    public Optional<Integer> getSocketTimeout();
         | 
| 77 85 | 
             
                }
         | 
| 78 86 |  | 
| 79 87 | 
             
                @Override
         | 
| @@ -202,6 +210,12 @@ public class CrawlerFilterPlugin | |
| 202 210 | 
             
                            if (task.getUserAgentString().isPresent()) {
         | 
| 203 211 | 
             
                                config.setUserAgentString(task.getUserAgentString().get());
         | 
| 204 212 | 
             
                            }
         | 
| 213 | 
            +
                            if (task.getSocketTimeout().isPresent()) {
         | 
| 214 | 
            +
                                config.setSocketTimeout(task.getSocketTimeout().get());
         | 
| 215 | 
            +
                            }
         | 
| 216 | 
            +
                            if (task.getConnectionTimeout().isPresent()) {
         | 
| 217 | 
            +
                                config.setConnectionTimeout(task.getConnectionTimeout().get());
         | 
| 218 | 
            +
                            }
         | 
| 205 219 |  | 
| 206 220 | 
             
                            PageFetcher pageFetcher = new PageFetcher(config);
         | 
| 207 221 | 
             
                            RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: embulk-filter-crawler
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.1.0
         | 
| 4 | 
            +
              version: 0.1.1
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - toyama0919
         | 
| 8 8 | 
             
            autorequire:
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2016-03- | 
| 11 | 
            +
            date: 2016-03-26 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| @@ -73,7 +73,7 @@ files: | |
| 73 73 | 
             
            - classpath/commons-logging-1.2.jar
         | 
| 74 74 | 
             
            - classpath/crawler4j-4.2.jar
         | 
| 75 75 | 
             
            - classpath/dom4j-1.6.1.jar
         | 
| 76 | 
            -
            - classpath/embulk-filter-crawler-0.1.0.jar
         | 
| 76 | 
            +
            - classpath/embulk-filter-crawler-0.1.1.jar
         | 
| 77 77 | 
             
            - classpath/fontbox-1.8.4.jar
         | 
| 78 78 | 
             
            - classpath/geronimo-stax-api_1.0_spec-1.0.1.jar
         | 
| 79 79 | 
             
            - classpath/httpclient-4.4.jar
         |