embulk-filter-crawler 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA1:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 52c2e67425310d86c6695fecbd4eeec2e80bc1f7
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 12615ac3b93eb0eef4dd7c057f2508b070770611
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 691bdc10a98944c09892392eab8187d14209cc00724d235c4fdab2d5d66f202254133da858a23a11371142941eb4bb060dda46960a529d5fb6e1091e4ef2ba9f
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 4c5d5da29cbda4ba32f2450aeea928f3779179634565450214494d9ca9a29de41f63f1df536e15cac6f4ce98c3c1e32242396c2d13df740b00628d8f8b30219a
         
     | 
    
        data/build.gradle
    CHANGED
    
    
| 
         @@ -19,6 +19,7 @@ import org.embulk.spi.PageOutput; 
     | 
|
| 
       19 
19 
     | 
    
         
             
            import org.embulk.spi.PageReader;
         
     | 
| 
       20 
20 
     | 
    
         
             
            import org.embulk.spi.Schema;
         
     | 
| 
       21 
21 
     | 
    
         
             
            import org.embulk.spi.type.Types;
         
     | 
| 
      
 22 
     | 
    
         
            +
            import org.slf4j.Logger;
         
     | 
| 
       22 
23 
     | 
    
         | 
| 
       23 
24 
     | 
    
         
             
            import com.google.common.base.Optional;
         
     | 
| 
       24 
25 
     | 
    
         
             
            import com.google.common.collect.ImmutableList;
         
     | 
| 
         @@ -34,6 +35,8 @@ import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer; 
     | 
|
| 
       34 
35 
     | 
    
         
             
            public class CrawlerFilterPlugin
         
     | 
| 
       35 
36 
     | 
    
         
             
                    implements FilterPlugin
         
     | 
| 
       36 
37 
     | 
    
         
             
            {
         
     | 
| 
      
 38 
     | 
    
         
            +
                private static final Logger logger = Exec.getLogger(CrawlerFilterPlugin.class);
         
     | 
| 
      
 39 
     | 
    
         
            +
             
     | 
| 
       37 
40 
     | 
    
         
             
                public interface PluginTask
         
     | 
| 
       38 
41 
     | 
    
         
             
                        extends Task
         
     | 
| 
       39 
42 
     | 
    
         
             
                {
         
     | 
| 
         @@ -131,21 +134,10 @@ public class CrawlerFilterPlugin 
     | 
|
| 
       131 
134 
     | 
    
         
             
                    return new PageOutput() {
         
     | 
| 
       132 
135 
     | 
    
         
             
                        private PageReader reader = new PageReader(inputSchema);
         
     | 
| 
       133 
136 
     | 
    
         
             
                        private PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
         
     | 
| 
       134 
     | 
    
         
            -
                        private CrawlController controller = getController();
         
     | 
| 
       135 
137 
     | 
    
         | 
| 
       136 
138 
     | 
    
         
             
                        @Override
         
     | 
| 
       137 
139 
     | 
    
         
             
                        public void finish()
         
     | 
| 
       138 
140 
     | 
    
         
             
                        {
         
     | 
| 
       139 
     | 
    
         
            -
                            for (Object object : controller.getCrawlersLocalData()) {
         
     | 
| 
       140 
     | 
    
         
            -
                                CrawlStat crawlStat = (CrawlStat) object;
         
     | 
| 
       141 
     | 
    
         
            -
                                for (Map<String, Object> map : crawlStat.getPages()) {
         
     | 
| 
       142 
     | 
    
         
            -
                                    for (Column outputColumn : outputSchema.getColumns()) {
         
     | 
| 
       143 
     | 
    
         
            -
                                        final Object value = map.get(outputColumn.getName());
         
     | 
| 
       144 
     | 
    
         
            -
                                        setValue(value, outputColumn);
         
     | 
| 
       145 
     | 
    
         
            -
                                    }
         
     | 
| 
       146 
     | 
    
         
            -
                                    builder.addRecord();
         
     | 
| 
       147 
     | 
    
         
            -
                                }
         
     | 
| 
       148 
     | 
    
         
            -
                            }
         
     | 
| 
       149 
141 
     | 
    
         
             
                            builder.finish();
         
     | 
| 
       150 
142 
     | 
    
         
             
                        }
         
     | 
| 
       151 
143 
     | 
    
         | 
| 
         @@ -158,6 +150,7 @@ public class CrawlerFilterPlugin 
     | 
|
| 
       158 
150 
     | 
    
         
             
                        @Override
         
     | 
| 
       159 
151 
     | 
    
         
             
                        public void add(Page page)
         
     | 
| 
       160 
152 
     | 
    
         
             
                        {
         
     | 
| 
      
 153 
     | 
    
         
            +
                            CrawlController controller = getController();
         
     | 
| 
       161 
154 
     | 
    
         
             
                            reader.setPage(page);
         
     | 
| 
       162 
155 
     | 
    
         
             
                            while (reader.nextRecord()) {
         
     | 
| 
       163 
156 
     | 
    
         
             
                                controller.addSeed(reader.getString(keyNameColumn));
         
     | 
| 
         @@ -169,6 +162,16 @@ public class CrawlerFilterPlugin 
     | 
|
| 
       169 
162 
     | 
    
         
             
                            }
         
     | 
| 
       170 
163 
     | 
    
         
             
                            controller.setCustomData(customData);
         
     | 
| 
       171 
164 
     | 
    
         
             
                            controller.start(EmbulkCrawler.class, task.getNumberOfCrawlers());
         
     | 
| 
      
 165 
     | 
    
         
            +
                            for (Object object : controller.getCrawlersLocalData()) {
         
     | 
| 
      
 166 
     | 
    
         
            +
                                CrawlStat crawlStat = (CrawlStat) object;
         
     | 
| 
      
 167 
     | 
    
         
            +
                                for (Map<String, Object> map : crawlStat.getPages()) {
         
     | 
| 
      
 168 
     | 
    
         
            +
                                    for (Column outputColumn : outputSchema.getColumns()) {
         
     | 
| 
      
 169 
     | 
    
         
            +
                                        final Object value = map.get(outputColumn.getName());
         
     | 
| 
      
 170 
     | 
    
         
            +
                                        setValue(value, outputColumn);
         
     | 
| 
      
 171 
     | 
    
         
            +
                                    }
         
     | 
| 
      
 172 
     | 
    
         
            +
                                    builder.addRecord();
         
     | 
| 
      
 173 
     | 
    
         
            +
                                }
         
     | 
| 
      
 174 
     | 
    
         
            +
                            }
         
     | 
| 
       172 
175 
     | 
    
         
             
                        }
         
     | 
| 
       173 
176 
     | 
    
         | 
| 
       174 
177 
     | 
    
         
             
                        /**
         
     | 
| 
         @@ -91,7 +91,7 @@ public class EmbulkCrawler extends WebCrawler 
     | 
|
| 
       91 
91 
     | 
    
         
             
                        BinaryParseData binaryParseData = (BinaryParseData) parseData;
         
     | 
| 
       92 
92 
     | 
    
         
             
                        map.put(outputPrefix + Constants.HTML, binaryParseData.getHtml());
         
     | 
| 
       93 
93 
     | 
    
         
             
                    }
         
     | 
| 
       94 
     | 
    
         
            -
                    logger. 
     | 
| 
      
 94 
     | 
    
         
            +
                    logger.debug("{}", webURL.getURL());
         
     | 
| 
       95 
95 
     | 
    
         
             
                    myCrawlStat.pages.add(map);
         
     | 
| 
       96 
96 
     | 
    
         
             
                }
         
     | 
| 
       97 
97 
     | 
    
         | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: embulk-filter-crawler
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.1.1
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.1.2
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - toyama0919
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire:
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date: 2016- 
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2016-04-04 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
14 
     | 
    
         
             
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
         @@ -73,7 +73,7 @@ files: 
     | 
|
| 
       73 
73 
     | 
    
         
             
            - classpath/commons-logging-1.2.jar
         
     | 
| 
       74 
74 
     | 
    
         
             
            - classpath/crawler4j-4.2.jar
         
     | 
| 
       75 
75 
     | 
    
         
             
            - classpath/dom4j-1.6.1.jar
         
     | 
| 
       76 
     | 
    
         
            -
            - classpath/embulk-filter-crawler-0.1.1.jar
     | 
| 
      
 76 
     | 
    
         
            +
            - classpath/embulk-filter-crawler-0.1.2.jar
         
     | 
| 
       77 
77 
     | 
    
         
             
            - classpath/fontbox-1.8.4.jar
         
     | 
| 
       78 
78 
     | 
    
         
             
            - classpath/geronimo-stax-api_1.0_spec-1.0.1.jar
         
     | 
| 
       79 
79 
     | 
    
         
             
            - classpath/httpclient-4.4.jar
         
     |