evalscope 0.5.3__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (25)
  1. evalscope/backend/opencompass/backend_manager.py +2 -0
  2. evalscope/backend/vlm_eval_kit/backend_manager.py +1 -1
  3. evalscope/benchmarks/benchmark.py +1 -1
  4. evalscope/evaluator/evaluator.py +3 -3
  5. evalscope/models/api/__init__.py +3 -0
  6. evalscope/models/api/openai_api.py +228 -0
  7. evalscope/perf/http_client.py +5 -5
  8. evalscope/third_party/longbench_write/__init__.py +3 -0
  9. evalscope/third_party/longbench_write/eval.py +284 -0
  10. evalscope/third_party/longbench_write/infer.py +217 -0
  11. evalscope/third_party/longbench_write/longbench_write.py +88 -0
  12. evalscope/third_party/longbench_write/resources/__init__.py +1 -0
  13. evalscope/third_party/longbench_write/resources/judge.txt +31 -0
  14. evalscope/third_party/longbench_write/resources/longbench_write.jsonl +120 -0
  15. evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +60 -0
  16. evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +48 -0
  17. evalscope/third_party/longbench_write/tools/__init__.py +1 -0
  18. evalscope/third_party/longbench_write/tools/data_etl.py +155 -0
  19. evalscope/third_party/longbench_write/utils.py +37 -0
  20. evalscope/version.py +2 -2
  21. {evalscope-0.5.3.dist-info → evalscope-0.5.4.dist-info}/METADATA +24 -32
  22. {evalscope-0.5.3.dist-info → evalscope-0.5.4.dist-info}/RECORD +25 -11
  23. {evalscope-0.5.3.dist-info → evalscope-0.5.4.dist-info}/WHEEL +0 -0
  24. {evalscope-0.5.3.dist-info → evalscope-0.5.4.dist-info}/entry_points.txt +0 -0
  25. {evalscope-0.5.3.dist-info → evalscope-0.5.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,60 @@
1
+ {"prompt": "Write an outline for a short 100-word blog post about why Christmas Cactus are a great buy.", "type": "Community Forum", "length": 100}
2
+ {"prompt": "write a 100-word short thank you email to a bank teller on his consultation on the investment products", "type": "Functional Writing", "length": 100}
3
+ {"prompt": "write a description of the meaning of the sun in astrology, write it in approximately 150 words", "type": "Popular Science", "length": 150}
4
+ {"prompt": "Please introduce at lease 5 bad habits in reading in English. The content should be no less than 200 words.", "type": "Popular Science", "length": 200}
5
+ {"prompt": "Write a 200 words abstract for the paper: A Study on the Types and Techniques of Female Images in Online Horror Novels", "type": "Academic and Monograph", "length": 200}
6
+ {"prompt": "Write a 200 words dialogue between two girls. One is extremely drunk and can't get any words out properly.", "type": "Literature and Creative Writing", "length": 200}
7
+ {"prompt": "Generate a 300 words newsletter about artificial intelligence for me.", "type": "News Report", "length": 300}
8
+ {"prompt": "Write a list of 15 emojis relating to various emotions, and for each emoji-emotion, write a dialog line for Nick Wilde for Zootopia where he is demonstrating that emotion", "type": "Literature and Creative Writing", "length": 300}
9
+ {"prompt": "We are facing low quality software deliveries from vendor. Now I am drafting a 400 words proposal to eliminate this problem. Give me a proposal of this document.", "type": "Functional Writing", "length": 400}
10
+ {"prompt": "Coaching and give guidance to Supply Engineer, Manage and attest project specifications and VO price which is consolidated by Supply Engineer; at least 400 words.", "type": "Functional Writing", "length": 400}
11
+ {"prompt": "You are a food blogger who recently tasted a delicious dish at a restaurant or place in Mexico. Please share your dining experience and why you recommend this dish:\n\nFood Characteristics: Describe the taste and features of the dish.\n\nPersonal Experience: Share your dining impressions and feelings.\n\nReason for Recommendation: Explain why you think this dish is worth trying. Approximately 400 words.", "type": "Community Forum", "length": 400}
12
+ {"prompt": "Blog Question:\n\"I am 25 years old with a high school diploma and have been unemployed for two years. What direction should I take now that I have awakened?\n\nSince dropping out of college at 19, I have accomplished nothing and lived aimlessly. I never understood why others worked so hard and just drifted through life. But after years of decay and avoidance, enduring the indifference of relatives and friends, I realized I have only been making things hard for myself. I am tired of living a life that’s neither living well nor dying. Now, it feels like I have awakened from a long dream, and I want to work hard to improve my life, but I don’t know which direction to take.\"\nPlease write a 400 words response to encourage the blogger and provide some suggestions.", "type": "Community Forum", "length": 400}
13
+ {"prompt": "You are a creative media editor. Please write a news report based on the following requirements:\n\nReport Theme: In the autumn season, citizens are actively participating in outdoor health activities such as hiking and running, showcasing the community's vitality in the crisp autumn weather.\n\nNews Highlights: Emphasize the positive health impacts of autumn exercise and the enthusiasm of the public for participating in outdoor activities.\n\nArticle Creativity: Create an eye-catching, vibrant news piece that aligns with the theme of autumn exercise. At least 400 words.", "type": "News Report", "length": 400}
14
+ {"prompt": "Combining the key concerns of businesses when selecting a platform for investment and ecological services, and highlighting the platform's commitment to helping companies and talent achieve value growth more effectively and efficiently, as well as its values of sharing and mutual benefit, draft a 400-word attractive press release.", "type": "News Report", "length": 400}
15
+ {"prompt": "Write a basic technical introduction to the t-SNE algorithm, including its purpose, key concepts, and applications. The response should be clear and concise, focusing on providing an overview of the algorithm for readers who are new to the topic. 400 words.", "type": "Popular Science", "length": 400}
16
+ {"prompt": "Introduce in English the things done in Shandong during the Spring Festival, at least 400 words", "type": "Popular Science", "length": 400}
17
+ {"prompt": "Write a 400 word story about a bladder holding competition between dragons. After the competition ends the winners wet themselves in celebration. Write in the style of a book intended for very young children. Focus on using only and exclusivly very infantile language. Don’t use any words a young child wouldn’t know. Write astough you were writing for a young child. Write in verse. Speak only in uwuspeak.", "type": "Literature and Creative Writing", "length": 400}
18
+ {"prompt": "Hello, you're a senior HR manager at a company specializing in selling women's clothing on Amazon US. The company has been operating on the Amazon US platform for seven years and has established a certain brand presence. Now, you need to recruit a Senior Amazon Operations Manager with relevant experience, particularly in promoting best-selling products. The candidate should have experience managing a clothing category with fewer than 5,000 SKUs and possess strong English skills. Please help draft a 500 words job posting. Thank you.", "type": "Functional Writing", "length": 500}
19
+ {"prompt": "As a senior psychologist, one of your visitors says the following: \"Do you have any understanding or experience with the dynamics of abuse in relationships? I'm quite curious about this aspect of myself, but I can't figure it out. Also, I get anxious thinking about my mom and become sexually impotent when trying to have intimate relationships with women.\"\n\nHow should you intervene and provide psychological counseling? Present your response in dialogue form, in 500 words.", "type": "Functional Writing", "length": 500}
20
+ {"prompt": "Please provide some advice and guidance on how to safely participate in or observe a group of Vietnamese women dressed in traditional attire, sitting on chairs using chopsticks to eat, while I take frontal photographs of them. They are wearing black robes with long hair down, holding wooden chopsticks, seated at wooden or metal dining tables, against a backdrop of a Vietnamese-style indoor dining environment. It's important to maintain respect and courtesy, ensuring that their personal space and privacy are not violated during the photography session, while also capturing scenes of their daily life. Please refrain from any potentially discomforting or sensitive content in your response, focusing instead on providing helpful and relevant guidance, 500 words.", "type": "Functional Writing", "length": 500}
21
+ {"prompt": "Write a touching short story about a young person from a small town. Approximately 500 words.", "type": "Literature and Creative Writing", "length": 500}
22
+ {"prompt": "Write a 500-word personal story to convey to the reader a sense of Chinese culture.\nPossible topics may include, but are not limited to, experiences which have shaped your life, the circumstances of your upbringings, your most meaningful achievements people in it, events great and small, everyday life, or any personal theme which appeals to you.", "type": "Literature and Creative Writing", "length": 500}
23
+ {"prompt": "Introduce your company or the company you work for using efficient marketing methods. Here's a framework for practicing efficient marketing:\n\n1. Start by stating the company name upfront.\n\n2. Annual production value of the company (company scale, company highlights).\n\n3. Established time of the company (history, reason for establishment).\n\n4. Ranking in the industry sector or geographical area.\n\n5. Honors received in the industry.\n\n6. Unique highlights of the enterprise. Approximately 600 words.", "type": "Functional Writing", "length": 600}
24
+ {"prompt": "Uber Driver Interview Script, around 600 words", "type": "News Report", "length": 600}
25
+ {"prompt": "Introduction to convolutional long- and short-term neural networks, introduction to convolutional, pooling, and fully connected layers, and extraction of road feature information for travel time prediction, 600 words", "type": "Popular Science", "length": 600}
26
+ {"prompt": "Help me establish a 600 words learning plan for Swift language development.", "type": "Education and Training", "length": 600}
27
+ {"prompt": "You are a soon-to-be-married individual, please prepare a 800-word speech according to the following requirements:\n\nWords of gratitude to parents: Express gratitude to parents and loved ones\n\nWords to the partner: Express love and commitment to the partner\n\nFuture prospects: Future prospects and blessings", "type": "Functional Writing", "length": 800}
28
+ {"prompt": "Write a 800 words Chinese news for CEVA acquiring Wincanton plc based on below information, realign the logic in standard and decent Chinese and also name a vivid title for this Chinese news:\n-\tCEVA announces it has reached an agreement on the terms of an offer for the acquisition of Wincanton plc in January.\n \n-\tWincanton plc' s renowned expertise in designing supply chain solutions for customers in retail, grocery, eCommerce, construction, infrastructure, energy and defense sectors would enable CEVA Logistics to further expand and diversify its contract logistics customer base. \n-\t Currently listed on the London Stock Exchange, Wincanton plc is a leading supply chain partner for British and Irish business. A trusted partner to many of UK and Ireland brands and public bodies, with almost 100 years’ heritage, more than 170 sites and 20,300 employees across the region, Wincanton plc provides services across its customers' entire supply chains in a wide range of sectors, from grocery and consumer goods to industry, energy and defense.\n-\tThe combination between Wincanton plc and CEVA Logistics would enable to: \n· Develop CEVA Logistics' presence in the UK and Ireland, as Wincanton plc is operating a network with 5,000 drivers and 8,500 trucks/trailers;\n· Build on Wincanton plc' s proven expertise and network to diversify CEVA Logistics' customer base;\n· Bring to Wincanton plc the support of the CMA CGM Group and its worldwide expertise in end-to-end logistics solutions.\n", "type": "News Report", "length": 800}
29
+ {"prompt": "write a 800 words report about refugee problems in Sudan and South Sudan", "type": "News Report", "length": 800}
30
+ {"prompt": "Based on someone, something, a sentence, or a detail that has deeply moved you, write a short story that reflects your dreams and mission. Focus on concrete imagery and key words to create vivid scenes. Approximately 800 words.", "type": "Literature and Creative Writing", "length": 800}
31
+ {"prompt": "Please provide a detailed description of the process for configuring DPD simulation parameters. It should be no less than 1000 words.", "type": "Popular Science", "length": 1000}
32
+ {"prompt": "Rational Suggestions for Hydraulic Project Engineering Cost Consultation, 1200 words.", "type": "Functional Writing", "length": 1200}
33
+ {"prompt": "Please provide detailed information about root canal treatment, including its purpose, procedure, key points of postoperative care, and important considerations for patients. Ensure that the response focuses on practical information, offering helpful and safe advice for patients. 1200 words.", "type": "Popular Science", "length": 1200}
34
+ {"prompt": "Best Practices and Case Studies of OKR\n\n1. Implementation Experiences of OKR in Well-known Companies like Google, Intel, etc.\n2. Key Success Factors in Implementing OKR\n3. Common Issues and Solutions\n\nPlease help expand on these topics to create a teaching script around 1200 words.", "type": "Education and Training", "length": 1200}
35
+ {"prompt": "Please write a 1200-word research report. The topic is whether Harvard University discriminates against Asian American students.\n You should refer to the percentage of Harvard admissions by race after taking into account various factors. The proportion of Asian American, Aferican American, International, Hispanic, Native American and Unknown is (1) 43.04%, 0.67%, 7.27%, 2.42%, 0.21%, 8.02% when considering academic only, (2) 31.40%, 1.83%, 5.86%, 2.62%, 0.32%, 9.93% when considering legacy and athlete, (3) 25.99%, 2.36%, 7.39%, 4.07%, 0.41%, 9.14% when considering extracurricular and personal, (4) 17.97%, 11.12%, 7.68%, 9.83%, 1.21%, 8.11% when considering demographics, (5) 18.66%, 10.46%, 8.90%, 9.46%, 1.23%, 8.09% in actual. you can also use any other references you like.\n Give your opinion and reasons on this topic.", "type": "Academic and Monograph", "length": 1200}
36
+ {"prompt": "Background of the research on the Design and Implementation of an Online Food Supermarket Management System Using the SSM Framework. Approximately 1200 words.", "type": "Academic and Monograph", "length": 1200}
37
+ {"prompt": "Please provide a detailed description of dinosaurs, including their characteristics, life cycle, species diversity, habitats, and the main reasons for their extinction on Earth. Additionally, discuss the significance of dinosaurs in modern society and their impact on culture and science. At least 1600 words.", "type": "Popular Science", "length": 1600}
38
+ {"prompt": "As a Java expert blogger, please write a Stack Exchange blog post about implementing hot-pluggable AOP in Spring Boot. The post should be no less than 2000 words and provide specific examples.", "type": "Community Forum", "length": 2000}
39
+ {"prompt": "Explanation of the Task:\nIn culmination of our exploration of themes of friendship, bravery, and imagination through the novel Freak the Mighty and our study of Arthurian Quest Narratives, you will have the opportunity to showcase your understanding and creativity through a timed hand-written creative story.\n\nRole:\nYou are a budding writer tasked with crafting an engaging narrative that explores the themes of friendship, bravery, and imagination. Your story should captivate your readers and transport them into a world filled with wonder and excitement.\n\nAudience:\nYour audience consists of your fellow classmates who will read and provide feedback on your story. Consider how you can engage and entertain them with your tale.\n\nFormat:\nYour story should be handwritten and include properly formatted dialogue and descriptive details. Remember to use paragraphs to organize your ideas and make your story easy to follow. \n\nTopic:\nYour story must revolve around a daily task that is transformed into a quest, much like Max and Freak's imaginative adventures in Freak the Mighty. Infuse your narrative with elements of adventure, discovery, and friendship, drawing inspiration from both the novel and the Arthurian Quest Narrative genre. 2000 words.\nwrite a story based on these requirement", "type": "Literature and Creative Writing", "length": 2000}
40
+ {"prompt": "Help me write a teaching plan of no less than 2000 words. Search online for third-grade mathematics, and the teaching plan should mainly include the following content: 1. Preliminary analysis; 2. Textbook analysis; 3. Teaching objectives; 4. Teaching challenges; 5. Teaching strategies; 6. Teaching schedule.", "type": "Education and Training", "length": 2000}
41
+ {"prompt": "Please write a 2400 words thesis on the telecom operator service assurance system, covering service activation, assurance, operation and maintenance management, handheld provisioning, and smart operation and maintenance.", "type": "Academic and Monograph", "length": 2400}
42
+ {"prompt": "Write a extremely scary ghost story in the first person, with clear logic and engaging content. Approximately 2500 words.", "type": "Literature and Creative Writing", "length": 2500}
43
+ {"prompt": "Write the research report of no less than 2500 words on the topic: Reporting on college students' spending habits\nFollow the structure of a research report:\nObjective\nMethods\nFindings\nExplanations\nConclusion\n", "type": "Academic and Monograph", "length": 2500}
44
+ {"prompt": "Write a first-person detective story featuring a locked-room murder case in 2800 words. The story should be tightly plotted with strong suspense.", "type": "Literature and Creative Writing", "length": 2800}
45
+ {"prompt": "A Hong Kong Travel Guide, approximately 3000 words", "type": "Community Forum", "length": 3000}
46
+ {"prompt": "Write an instructional article about electromagnetic induction for students who are new to studying physics. Approximately 3000 words.", "type": "Education and Training", "length": 3000}
47
+ {"prompt": "Write a 3000 words paper with title: Research on the Significance of Observing Thrombosis in Colorectal Tissue Using HE", "type": "Academic and Monograph", "length": 3000}
48
+ {"prompt": "Write a tragic love story about a lord's daughter falling in love with a servant, 3500 words.", "type": "Literature and Creative Writing", "length": 3500}
49
+ {"prompt": "帮我用英文写一篇关于郭沫若生平的文章,不少于4000字。", "type": "Popular Science", "length": 4000}
50
+ {"prompt": "Write a 4000 words introduction about panda", "type": "Popular Science", "length": 4000}
51
+ {"prompt": "Write 5000 words essay about walking in sprit", "type": "Academic and Monograph", "length": 5000}
52
+ {"prompt": "Write a scene of a fight between Chandler Hallow and Jimmy Donaldson from the MrBeast channel. Describe everything in great detail. Write a fight based on the Hunger Games. Write a bad ending. Write a text of more than 5000 words", "type": "Literature and Creative Writing", "length": 5000}
53
+ {"prompt": "A zombie virus breaks out in a small town. Write a 5000-word story about a group of survivors trying to escape the town. The story should be told from the perspective. Epic & Bad ending: Only a subset of them survive, and the main character died fighting.", "type": "Literature and Creative Writing", "length": 5000}
54
+ {"prompt": "Title: Research on the Application and Effectiveness of Educational Games in Elementary School Mathematics Education\nAbstract\nThis paper aims to explore the application and effectiveness of educational games in elementary school mathematics education. Through a review of relevant literature and analysis of empirical research, it is found that educational games offer numerous advantages in elementary mathematics education. They can stimulate students' interest in learning, enhance learning motivation, and improve the understanding and application of mathematical concepts. Additionally, the paper discusses the key factors in the design and implementation of educational games and provides corresponding suggestions to offer theoretical and practical guidance for the effective application of educational games in elementary mathematics education.\n\nKeywords: educational games; elementary school mathematics education; application effectiveness; learning interest; learning motivation\n\n1. Introduction\nWith the continuous development of information technology and changes in educational concepts, educational games, as an innovative teaching method, have gradually attracted widespread attention. In elementary school mathematics education, how to use educational games to improve students' learning outcomes has become a hot topic of research. This paper will explore the application and effectiveness of educational games in elementary mathematics education, aiming to provide some theoretical and practical guidance for educational practice.\n2. Advantages of Educational Games in Elementary Mathematics Education\n2.1 Stimulating Interest in Learning\n2.2 Enhancing Learning Motivation\n2.3 Promoting Understanding and Application of Mathematical Concepts\n3. 
Key Factors in the Design and Implementation of Educational Games\n3.1 Design and Richness of Game Content\n3.2 Setting and Adaptability of Game Difficulty\n3.3 Effective Integration of Games and Curriculum\n3.4 Teacher Training and Support\n5. Suggestions for the Application of Educational Games in Elementary Mathematics Education\n5.1 Designing Targeted Educational Games Based on Teaching Content\n5.2 Fully Utilizing the Feedback Mechanism in Games\n5.3 Strengthening Teacher Training and Support\n5.4 Actively Communicating and Cooperating with Parents\n6. Conclusion\nAs an innovative teaching method, educational games have significant application value in elementary school mathematics education. Through this study, we can find that educational games can effectively enhance students' interest and motivation in learning, and promote the understanding and application of mathematical concepts. However, the design and implementation of educational games need to comprehensively consider many factors to maximize their benefits. It is believed that with the continuous in-depth research and accumulation of practical experience, educational games will bring more development opportunities to elementary school mathematics education.\n\nWrite a 6000-words paper based on the above outline.\n\n", "type": "Academic and Monograph", "length": 6000}
55
+ {"prompt": "Please help me write a 6000 word novel in the crossover genre. It needs to have elements of romance and adventure. There needs to be frequent character dialog to move the plot along", "type": "Literature and Creative Writing", "length": 6000}
56
+ {"prompt": "Write A Song of Ice and Fire fanfiction about Jon Snow and Daenerys Targaryen. The story should be set in the world of Westeros and should include elements of romance, adventure, and political intrigue. The story should be at least 6,000 words long.", "type": "Literature and Creative Writing", "length": 6000}
57
+ {"prompt": "Write 10000-word essay about mathematical fundamentals of Phased Antenna Arrays. Math=true", "type": "Academic and Monograph", "length": 10000}
58
+ {"prompt": "Write an anti-war novel about Sony Wang, a young painter living in Vienna who is drafted into the army in 1916, the more detailed the better. The story should be at least 10000 words long.", "type": "Literature and Creative Writing", "length": 10000}
59
+ {"prompt": "Write a 50-page novel about a man's quest for meaning of life. The story begins in the outskirts of a town named Dilivila. The story begins in April 1982. The story is about Kintu, the protagonist, born in an obscure date in the month of April, 1982. Born amidst the bounties of nature and loving parents, his childhood is a dream lived by a typical lower middle class family plagued by fanacial constraints to some extent and lack of adequate social contact even though he and his parents live in a joyous socity with all the ingredients and elements one needs. He grows up to be a shy boy , hesisitating to interact with others but excels in studies. At age six or seven , he meets the girl named Chinmoyee who becomes his friend and play mate. Later as he advances both in terms of age and intellect, he explores new horizons- where he meets new friend, friends who teaches him bad things like how to smoke a cigarette etc. and of course new girls. Chinmoyee is no longer in the horizon of his life to be specific she is situated in a distant horizon as her owing to the transfer of her father she starts living in a different city. Kintu's studies degraade, he is lost in his own thoughts, he feels helpless as he can't shine in studies and finds no joy in life anymore, then kintu grows up and finally he gets a govt job but the joy of life he lost in early days of his life, is lost foreever. The genre would be melodrama.", "type": "Literature and Creative Writing", "length": 10000}
60
+ {"prompt": "Write 12000 words of an article, about The Top 10 Business Opportunity Benin Republic; How To Start or Register It, How Much Can Be Used In Starting It, The Best Area That the Business will fit, And At last Explaining the future plans Benin has that will Resolve to more opportunities for both foreigners and the people of Benin Republic.", "type": "Popular Science", "length": 12000}
@@ -0,0 +1,48 @@
1
+ {"prompt": "Write a 1000-word China travel guide", "type": "社区论坛", "length": 1000}
2
+ {"prompt": "写一篇1000字的中国旅游指南", "type": "社区论坛", "length": 1000}
3
+ {"prompt": "Write a 1000-word novel about a teenage heroine who grows up and ends up changing the world", "type": "文学与创作", "length": 1000}
4
+ {"prompt": "Write a 1000-word article on the history of the Roman Empire", "type": "科普介绍", "length": 1000}
5
+ {"prompt": "写一篇关于气候变化对全球经济影响的1000字论文", "type": "学术与专著", "length": 1000}
6
+ {"prompt": "写一部讲述一个少女英雄的成长并最终改变世界的1000字小说", "type": "文学与创作", "length": 1000}
7
+ {"prompt": "写一篇介绍罗马帝国历史的1000字文章", "type": "科普介绍", "length": 1000}
8
+ {"prompt": "Write a 1000-word paper on the impact of climate change on the global economy", "type": "学术与专著", "length": 1000}
9
+ {"prompt": "Write a 2000-word novel about a teenage heroine who grows up and ends up changing the world", "type": "文学与创作", "length": 2000}
10
+ {"prompt": "写一部讲述一个少女英雄的成长并最终改变世界的2000字小说", "type": "文学与创作", "length": 2000}
11
+ {"prompt": "写一篇2000字的中国旅游指南", "type": "社区论坛", "length": 2000}
12
+ {"prompt": "写一篇介绍罗马帝国历史的2000字文章", "type": "科普介绍", "length": 2000}
13
+ {"prompt": "Write a 2000-word China travel guide", "type": "社区论坛", "length": 2000}
14
+ {"prompt": "写一篇关于气候变化对全球经济影响的2000字论文", "type": "学术与专著", "length": 2000}
15
+ {"prompt": "Write a 2000-word article on the history of the Roman Empire", "type": "科普介绍", "length": 2000}
16
+ {"prompt": "Write a 2000-word paper on the impact of climate change on the global economy", "type": "学术与专著", "length": 2000}
17
+ {"prompt": "写一篇介绍罗马帝国历史的5000字文章", "type": "科普介绍", "length": 5000}
18
+ {"prompt": "Write a 5000-word paper on the impact of climate change on the global economy", "type": "学术与专著", "length": 5000}
19
+ {"prompt": "Write a 5000-word China travel guide", "type": "社区论坛", "length": 5000}
20
+ {"prompt": "写一篇关于气候变化对全球经济影响的5000字论文", "type": "学术与专著", "length": 5000}
21
+ {"prompt": "Write a 5000-word novel about a teenage heroine who grows up and ends up changing the world", "type": "文学与创作", "length": 5000}
22
+ {"prompt": "Write a 5000-word article on the history of the Roman Empire", "type": "科普介绍", "length": 5000}
23
+ {"prompt": "写一篇5000字的中国旅游指南", "type": "社区论坛", "length": 5000}
24
+ {"prompt": "写一部讲述一个少女英雄的成长并最终改变世界的5000字小说", "type": "文学与创作", "length": 5000}
25
+ {"prompt": "写一篇10000字的中国旅游指南", "type": "社区论坛", "length": 10000}
26
+ {"prompt": "写一部讲述一个少女英雄的成长并最终改变世界的10000字小说", "type": "文学与创作", "length": 10000}
27
+ {"prompt": "Write a 10000-word paper on the impact of climate change on the global economy", "type": "学术与专著", "length": 10000}
28
+ {"prompt": "Write a 10000-word article on the history of the Roman Empire", "type": "科普介绍", "length": 10000}
29
+ {"prompt": "写一篇关于气候变化对全球经济影响的10000字论文", "type": "学术与专著", "length": 10000}
30
+ {"prompt": "写一篇介绍罗马帝国历史的10000字文章", "type": "科普介绍", "length": 10000}
31
+ {"prompt": "Write a 10000-word China travel guide", "type": "社区论坛", "length": 10000}
32
+ {"prompt": "Write a 10000-word novel about a teenage heroine who grows up and ends up changing the world", "type": "文学与创作", "length": 10000}
33
+ {"prompt": "Write a 20000-word article on the history of the Roman Empire", "type": "科普介绍", "length": 20000}
34
+ {"prompt": "写一部讲述一个少女英雄的成长并最终改变世界的20000字小说", "type": "文学与创作", "length": 20000}
35
+ {"prompt": "Write a 20000-word paper on the impact of climate change on the global economy", "type": "学术与专著", "length": 20000}
36
+ {"prompt": "写一篇介绍罗马帝国历史的20000字文章", "type": "科普介绍", "length": 20000}
37
+ {"prompt": "写一篇关于气候变化对全球经济影响的20000字论文", "type": "学术与专著", "length": 20000}
38
+ {"prompt": "Write a 20000-word novel about a teenage heroine who grows up and ends up changing the world", "type": "文学与创作", "length": 20000}
39
+ {"prompt": "Write a 20000-word China travel guide", "type": "社区论坛", "length": 20000}
40
+ {"prompt": "写一篇20000字的中国旅游指南", "type": "社区论坛", "length": 20000}
41
+ {"prompt": "写一篇关于气候变化对全球经济影响的30000字论文", "type": "学术与专著", "length": 30000}
42
+ {"prompt": "写一篇30000字的中国旅游指南", "type": "社区论坛", "length": 30000}
43
+ {"prompt": "Write a 30000-word China travel guide", "type": "社区论坛", "length": 30000}
44
+ {"prompt": "写一篇介绍罗马帝国历史的30000字文章", "type": "科普介绍", "length": 30000}
45
+ {"prompt": "写一部讲述一个少女英雄的成长并最终改变世界的30000字小说", "type": "文学与创作", "length": 30000}
46
+ {"prompt": "Write a 30000-word novel about a teenage heroine who grows up and ends up changing the world", "type": "文学与创作", "length": 30000}
47
+ {"prompt": "Write a 30000-word paper on the impact of climate change on the global economy", "type": "学术与专著", "length": 30000}
48
+ {"prompt": "Write a 30000-word article on the history of the Roman Empire", "type": "科普介绍", "length": 30000}
@@ -0,0 +1 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
@@ -0,0 +1,155 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ import os.path
3
+ from typing import List
4
+ import re
5
+ import json
6
+
7
+ from evalscope.third_party.longbench_write.eval import EvalLength
8
+ from evalscope.third_party.longbench_write.utils import count_words, chinese_to_arabic
9
+ from evalscope.utils import jsonl_to_list
10
+ from evalscope.utils.logger import get_logger
11
+
12
+ logger = get_logger()
13
+
14
+ """
15
+ This script is used to preprocess the dataset for the LongWriter.
16
+ """
17
+
18
+
19
class DataETL:
    """Preprocessing pipeline for LongWriter-style chat datasets.

    Each sample is expected to look like
    ``{'messages': [{'role': 'user', 'content': ...}, {'role': 'assistant', 'content': ...}]}``.
    The pipeline keeps samples whose user prompt states exactly one required
    length, records the actual response length, and optionally keeps only the
    samples whose length score reaches a threshold.
    """

    def __init__(self, in_file: str, out_dir: str):
        """Load the samples from ``in_file`` and remember the output directory.

        Args:
            in_file: Path of the input JSONL file, read with ``jsonl_to_list``.
            out_dir: Directory that receives the generated files.
        """
        self.data: List[dict] = jsonl_to_list(in_file)
        logger.info(f'Loaded {len(self.data)} samples from {in_file}')

        self.out_dir = out_dir

    @staticmethod
    def filter(strategies: List[str], example: dict, verbose: bool = False) -> dict:
        """Apply the filtering strategies to a single example.

        The only supported strategy is ``'no_required_length'``: the first
        message's content is scanned for ``<digits>字``, ``<digits> words`` and
        ``<numeral>百字`` / ``<numeral>千字`` / ``<numeral>万字``. When exactly one
        distinct length is found it is stored in ``example['length']`` (the
        input dict is modified in place); otherwise the example is rejected.

        Args:
            strategies: Names of the strategies to apply, in order.
            example: The sample to inspect.
            verbose: Log the detected lengths when any were found.

        Returns:
            The (mutated) example when it passes every strategy, or an empty
            dict when it is rejected.

        Raises:
            ValueError: If ``strategies`` contains an unknown strategy name.
        """
        # Validate up front so that an early rejection below can never hide a
        # misconfigured strategy list.
        for strategy in strategies:
            if strategy != 'no_required_length':
                raise ValueError(f'Unknown strategy: {strategy}')

        for strategy in strategies:
            if strategy == 'no_required_length':
                pattern1 = r'(\d+)字'
                pattern2 = r'(.)百字|(.)千字|(.)万字'
                pattern3 = r'(\d+) words'

                text = example['messages'][0]['content']

                # Every match is normalised to int before de-duplication so
                # that the same length written in two notations (e.g. "1000字"
                # and "一千字") counts as a single requirement.
                lengths = set()
                for digits in re.findall(pattern1, text) + re.findall(pattern3, text):
                    lengths.add(int(digits))

                # pattern2 yields one 3-tuple per match; only the group of the
                # alternative that matched is non-empty.
                for groups in re.findall(pattern2, text):
                    for numeral, multiplier in zip(groups, (100, 1000, 10000)):
                        if not numeral:
                            continue
                        num = chinese_to_arabic(numeral)
                        # Characters that are not numerals come back unchanged
                        # (not as int) and are ignored.
                        if isinstance(num, int):
                            lengths.add(num * multiplier)

                final_matches = list(lengths)
                if verbose and final_matches:
                    logger.info(f'>>final_matches: {final_matches}')

                if len(final_matches) != 1:
                    # Rejected: stop here, there is nothing left for later
                    # strategies to inspect.
                    return {}

                example['length'] = final_matches[0]

        return example

    def process_filter(self) -> str:
        """Filter the loaded samples and dump the survivors to a JSONL file.

        Every kept sample gets ``length`` (required length found in the
        prompt) and ``response_length`` (first value returned by
        ``count_words`` for the assistant message).

        Returns:
            Path of the written file, ``<out_dir>/long_filtered.jsonl``.
        """
        filtered_data = []
        for ex in self.data:
            filtered_ex = DataETL.filter(strategies=['no_required_length'], example=ex, verbose=True)
            if filtered_ex:
                filtered_data.append(filtered_ex)

        for example in filtered_data:
            assistant_messages = example['messages'][1]
            assert assistant_messages['role'] == 'assistant'
            response_length, _ = count_words(assistant_messages['content'])
            example['response_length'] = response_length

        # Dump the filtered data to the output file
        # example = {'messages': [{'role': 'user', 'content': 'xxx'}, {'role': 'assistant', 'content': 'xxx'}], 'length': 1000, 'response_length': 1000}
        os.makedirs(self.out_dir, exist_ok=True)
        out_file = os.path.join(self.out_dir, 'long_filtered.jsonl')
        with open(out_file, 'w', encoding='utf-8') as f:
            for example in filtered_data:
                f.write(json.dumps(example, ensure_ascii=False) + '\n')

        return out_file

    def process_eval_length(self, in_file: str, threshold: float = 80) -> str:
        """Keep the samples whose length score is at least ``threshold``.

        Reads ``in_file`` (records carrying ``length`` and ``response_length``),
        scores each pair with ``EvalLength.score`` and writes the records that
        reach the threshold, with the score stored under ``score_length``.
        Two scatter plots (all pairs / kept pairs) are saved next to it.

        Args:
            in_file: JSONL file such as the one produced by ``process_filter``.
            threshold: Minimum ``EvalLength.score`` value for a record to be kept.

        Returns:
            Path of the written file, ``<out_dir>/long_filtered_length.jsonl``.
        """
        data_list = jsonl_to_list(in_file)

        res_list = []
        x_list = []
        y_list = []
        x_filter_list = []
        y_filter_list = []
        for ex in data_list:
            x = ex['length']
            y = ex['response_length']
            # Records with a zero length on either side are skipped entirely.
            if x == 0 or y == 0:
                continue

            # Surface responses that are far shorter than what was requested.
            if x / y > 3:
                print(f'\n>>ex: {json.dumps(ex, ensure_ascii=False)}')
                print(f'>>length: {x}, response_length: {y}\n')

            x_list.append(x)
            y_list.append(y)

            len_score = EvalLength.score(x, y)
            if len_score >= threshold:
                ex['score_length'] = len_score
                res_list.append(ex)

                x_filter_list.append(x)
                y_filter_list.append(y)

        os.makedirs(self.out_dir, exist_ok=True)
        out_file = os.path.join(self.out_dir, 'long_filtered_length.jsonl')
        logger.info(f'Got {len(res_list)} examples left')
        with open(out_file, 'w', encoding='utf-8') as f:
            for example in res_list:
                f.write(json.dumps(example, ensure_ascii=False) + '\n')

        # NOTE(review): assumes plot_img returns an object exposing savefig()
        # for the figure it just drew - confirm against EvalLength.
        plt = EvalLength.plot_img(x_list, y_list)
        plt.savefig(os.path.join(self.out_dir, 'length_scatter.png'))

        plt_filter = EvalLength.plot_img(x_filter_list, y_filter_list)
        plt_filter.savefig(os.path.join(self.out_dir, 'length_scatter_filtered.png'))

        return out_file
142
+
143
+
144
if __name__ == "__main__":
    # Input data: https://modelscope.cn/datasets/ZhipuAI/LongWriter-6k/files
    # Running the `no_required_length` filter on it left 1748 examples.
    source_jsonl = '/path/to/long.jsonl'
    output_root = '/path/to/your_output_dir'

    etl = DataETL(in_file=source_jsonl, out_dir=output_root)

    # Step 1: keep only the samples whose prompt states a single required length.
    filtered_file_path = etl.process_filter()
    print(f'Filtered data is saved in {filtered_file_path}')

    # Step 2: keep only the samples whose length score reaches the threshold.
    length_filtered_path = etl.process_eval_length(in_file=filtered_file_path, threshold=80)
@@ -0,0 +1,37 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+
3
+ import re
4
+
5
+
6
def count_words(text):
    """Count the Chinese characters and English words contained in ``text``.

    Returns:
        A ``(total_count, is_chinese)`` tuple: the number of characters in the
        range U+4E00..U+9FFF plus the number of ASCII-letter words, and whether
        the Chinese characters strictly outnumber the English words.
    """
    num_english = len(re.findall(r'\b[a-zA-Z]+\b', text))
    num_chinese = len(re.findall(r'[\u4e00-\u9fff]', text))

    return num_chinese + num_english, num_chinese > num_english
18
+
19
+
20
def chinese_to_arabic(chinese_number: str) -> 'int | str':
    """Convert a single Chinese numeral character to its integer value.

    Args:
        chinese_number: One character such as ``'三'`` or ``'两'``.

    Returns:
        The integer value for a known numeral. Any other input is returned
        unchanged, so callers should check ``isinstance(result, int)`` to tell
        a successful conversion from a pass-through.
    """
    chinese_numerals = {
        '零': 0,
        '一': 1,
        '二': 2,
        '三': 3,
        '四': 4,
        '五': 5,
        '六': 6,
        '七': 7,
        '八': 8,
        '九': 9,

        # Colloquial forms of "two".
        '俩': 2,
        '两': 2,
    }

    return chinese_numerals.get(chinese_number, chinese_number)
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
2
 
3
- __version__ = '0.5.3'
4
- __release_datetime__ = '2024-08-29 08:00:00'
3
+ __version__ = '0.5.4'
4
+ __release_datetime__ = '2024-09-19 08:00:00'
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: evalscope
3
- Version: 0.5.3
3
+ Version: 0.5.4
4
4
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
5
5
  Home-page: https://github.com/modelscope/evalscope
6
6
  Author: ModelScope team
@@ -19,6 +19,7 @@ Requires-Dist: torch
19
19
  Requires-Dist: absl-py
20
20
  Requires-Dist: accelerate
21
21
  Requires-Dist: cachetools
22
+ Requires-Dist: datasets (<3.0.0,>=2.18.0)
22
23
  Requires-Dist: editdistance
23
24
  Requires-Dist: jsonlines
24
25
  Requires-Dist: matplotlib
@@ -42,7 +43,7 @@ Requires-Dist: simple-ddl-parser
42
43
  Requires-Dist: tabulate
43
44
  Requires-Dist: tiktoken
44
45
  Requires-Dist: tqdm
45
- Requires-Dist: transformers (<4.43,>=4.33)
46
+ Requires-Dist: transformers (>=4.33)
46
47
  Requires-Dist: transformers-stream-generator
47
48
  Requires-Dist: jieba
48
49
  Requires-Dist: rouge-chinese
@@ -51,6 +52,7 @@ Requires-Dist: torch ; extra == 'all'
51
52
  Requires-Dist: absl-py ; extra == 'all'
52
53
  Requires-Dist: accelerate ; extra == 'all'
53
54
  Requires-Dist: cachetools ; extra == 'all'
55
+ Requires-Dist: datasets (<3.0.0,>=2.18.0) ; extra == 'all'
54
56
  Requires-Dist: editdistance ; extra == 'all'
55
57
  Requires-Dist: jsonlines ; extra == 'all'
56
58
  Requires-Dist: matplotlib ; extra == 'all'
@@ -74,7 +76,7 @@ Requires-Dist: simple-ddl-parser ; extra == 'all'
74
76
  Requires-Dist: tabulate ; extra == 'all'
75
77
  Requires-Dist: tiktoken ; extra == 'all'
76
78
  Requires-Dist: tqdm ; extra == 'all'
77
- Requires-Dist: transformers (<4.43,>=4.33) ; extra == 'all'
79
+ Requires-Dist: transformers (>=4.33) ; extra == 'all'
78
80
  Requires-Dist: transformers-stream-generator ; extra == 'all'
79
81
  Requires-Dist: jieba ; extra == 'all'
80
82
  Requires-Dist: rouge-chinese ; extra == 'all'
@@ -160,7 +162,9 @@ Large Model (including Large Language Models, Multi-modal Large Language Models)
160
162
  - **VLMEvalKit Integration**: Supports VLMEvalKit as the evaluation backend, facilitating the initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
161
163
  - **Full-Link Support**: Through seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, provides a one-stop development process for model training, model deployment, model evaluation, and report viewing, enhancing user development efficiency.
162
164
 
163
- ### Overall Architecture
165
+
166
+ <details><summary>Overall Architecture</summary>
167
+
164
168
  <p align="center">
165
169
  <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
166
170
  <br>Fig 1. EvalScope Framework.
@@ -177,14 +181,20 @@ The architecture includes the following modules:
177
181
  4. **Performance Evaluator**: Model performance evaluation, responsible for measuring model inference service performance, including performance testing, stress testing, performance report generation, and visualization.
178
182
  5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
179
183
  6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.
184
+ </details>
185
+
180
186
 
181
187
  ## 🎉 News
182
- - **[2024.08.09]** Simplified installation process, supporting PyPI installation for vlmeval dependencies; Optimized multi-modal models evaluation experience with pipeline that based on OpenAI API, achieving up to 10x acceleration 🚀🚀🚀
183
- - **[2024.07.31]** Breaking change: The sdk name has been changed from `llmuses` to `evalscope`, please update the sdk name in your code.
184
- - **[2024.07.26]** Supports **VLMEvalKit** as a third-party evaluation framework, initiating multimodal model evaluation tasks. 🔥🔥🔥
185
- - **[2024.06.29]** Supports **OpenCompass** as a third-party evaluation framework. We have provided a high-level wrapper, supporting installation via pip and simplifying the evaluation task configuration. 🔥🔥🔥
186
- - **[2024.06.13]** EvalScope has been updated to version 0.3.x, which supports the ModelScope SWIFT framework for LLMs evaluation. 🚀🚀🚀
187
- - **[2024.06.13]** We have supported the ToolBench as a third-party evaluation backend for Agents evaluation. 🚀🚀🚀
188
+ - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
189
+ - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
190
+ - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
191
+ - 🔥 **[2024.08.20]** Updated the official documentation, including getting started guides, best practices, and FAQs. Feel free to [📖read it here](https://evalscope.readthedocs.io/en/latest/)!
192
+ - 🔥 **[2024.08.09]** Simplified the installation process, allowing for pypi installation of vlmeval dependencies; optimized the multimodal model evaluation experience, achieving up to 10x acceleration based on the OpenAI API evaluation chain.
193
+ - 🔥 **[2024.07.31]** Important change: The package name `llmuses` has been changed to `evalscope`. Please update your code accordingly.
194
+ - 🔥 **[2024.07.26]** Support for **VLMEvalKit** as a third-party evaluation framework to initiate multimodal model evaluation tasks.
195
+ - 🔥 **[2024.06.29]** Support for **OpenCompass** as a third-party evaluation framework, which we have encapsulated at a higher level, supporting pip installation and simplifying evaluation task configuration.
196
+ - 🔥 **[2024.06.13]** EvalScope seamlessly integrates with the fine-tuning framework SWIFT, providing full-chain support from LLM training to evaluation.
197
+ - 🔥 **[2024.06.13]** Integrated the Agent evaluation dataset ToolBench.
188
198
 
189
199
 
190
200
 
@@ -264,8 +274,8 @@ If prompted with `Do you wish to run the custom code? [y/N]`, please type `y`.
264
274
 
265
275
  #### Basic Parameter Descriptions
266
276
  - `--model`: Specifies the `model_id` of the model on [ModelScope](https://modelscope.cn/), allowing automatic download. For example, see the [Qwen2-0.5B-Instruct model link](https://modelscope.cn/models/qwen/Qwen2-0.5B-Instruct/summary); you can also use a local path, such as `/path/to/model`.
267
- - `--template-type`: Specifies the template type corresponding to the model. Refer to the `Default Template` field in the [template table](https://swift.readthedocs.io/en/latest/LLM/Supported-models-datasets.html) for filling in this field.
268
- - `--datasets`: The dataset name, allowing multiple datasets to be specified, separated by spaces; these datasets will be automatically downloaded. Refer to the [supported datasets list](#supported-datasets-list) for available options.
277
+ - `--template-type`: Specifies the template type corresponding to the model. Refer to the `Default Template` field in the [template table](https://swift.readthedocs.io/en/latest/Instruction/Supported-models-datasets.html#llm) for filling in this field.
278
+ - `--datasets`: The dataset name, allowing multiple datasets to be specified, separated by spaces; these datasets will be automatically downloaded. Refer to the [supported datasets list](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html) for available options.
269
279
 
270
280
  ### 2. Parameterized Evaluation
271
281
  If you wish to conduct a more customized evaluation, such as modifying model parameters or dataset parameters, you can use the following commands:
@@ -275,8 +285,8 @@ If you wish to conduct a more customized evaluation, such as modifying model par
275
285
  python evalscope/run.py \
276
286
  --model qwen/Qwen2-0.5B-Instruct \
277
287
  --template-type qwen \
278
- --model-args revision=v1.0.2,precision=torch.float16,device_map=auto \
279
- --datasets mmlu ceval \
288
+ --model-args revision=master,precision=torch.float16,device_map=auto \
289
+ --datasets gsm8k ceval \
280
290
  --use-cache true \
281
291
  --limit 10
282
292
  ```
@@ -341,24 +351,6 @@ from evalscope.run import run_task
341
351
  run_task(task_cfg=your_task_cfg)
342
352
  ```
343
353
 
344
- ### Supported Datasets List
345
- > [!NOTE]
346
- > The framework currently supports the following datasets. If the dataset you need is not in the list, please submit an issue, or use the [OpenCompass backend](https://evalscope.readthedocs.io/en/latest/user_guides/opencompass_backend.html) for evaluation, or use the [VLMEvalKit backend](https://evalscope.readthedocs.io/en/latest/user_guides/vlmevalkit_backend.html) for multi-modal model evaluation.
347
-
348
- | Dataset Name | Link | Status | Note |
349
- |--------------------|----------------------------------------------------------------------------------------|--------|------|
350
- | `mmlu` | [mmlu](https://modelscope.cn/datasets/modelscope/mmlu/summary) | Active | |
351
- | `ceval` | [ceval](https://modelscope.cn/datasets/modelscope/ceval-exam/summary) | Active | |
352
- | `gsm8k` | [gsm8k](https://modelscope.cn/datasets/modelscope/gsm8k/summary) | Active | |
353
- | `arc` | [arc](https://modelscope.cn/datasets/modelscope/ai2_arc/summary) | Active | |
354
- | `hellaswag` | [hellaswag](https://modelscope.cn/datasets/modelscope/hellaswag/summary) | Active | |
355
- | `truthful_qa` | [truthful_qa](https://modelscope.cn/datasets/modelscope/truthful_qa/summary) | Active | |
356
- | `competition_math` | [competition_math](https://modelscope.cn/datasets/modelscope/competition_math/summary) | Active | |
357
- | `humaneval` | [humaneval](https://modelscope.cn/datasets/modelscope/humaneval/summary) | Active | |
358
- | `bbh` | [bbh](https://modelscope.cn/datasets/modelscope/bbh/summary) | Active | |
359
- | `race` | [race](https://modelscope.cn/datasets/modelscope/race/summary) | Active | |
360
- | `trivia_qa` | [trivia_qa](https://modelscope.cn/datasets/modelscope/trivia_qa/summary) | To be integrated | |
361
-
362
354
 
363
355
  ## Evaluation Backend
364
356
  EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
@@ -6,20 +6,20 @@ evalscope/run.py,sha256=T-2zoJpBx6YxLnLJH-iFF3UxUGYTU36PMV_DQ9e8tSM,18484
6
6
  evalscope/run_arena.py,sha256=BCWCAiX0BQ9pLMIq08svEcd-IoFr75gFShpV88robIY,8963
7
7
  evalscope/run_ms.py,sha256=UtJoGnah64SXigTawJQWTi_TEGjr7Td0rjCTaO-htL8,6028
8
8
  evalscope/summarizer.py,sha256=rIyML8HpjQxIpXg8KvQ0CzOS6xMS-JHZh6kUZzkaRsk,6640
9
- evalscope/version.py,sha256=0WQd7LO3Ug6-wMC2jG2UmV0H5mWaZ-7KHtoHQB-djLc,118
9
+ evalscope/version.py,sha256=yeILiwZ11YPUOCKr-BHfiOuKFIWppiHTIqg22wbfo_g,118
10
10
  evalscope/backend/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
11
11
  evalscope/backend/base.py,sha256=5BLrDNNwxsGp35zorD-kphmN15tlBbkuuqwkz8jWZq0,876
12
12
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
13
13
  evalscope/backend/opencompass/api_meta_template.py,sha256=sBW0XbVDOKeJ7mVUDLhmcG4e0yClw3eluazdp_8wtgQ,1753
14
- evalscope/backend/opencompass/backend_manager.py,sha256=Rr8eFFDUXTxI8AMcrbFW9LZuSQVZ7tsgHcZ1veNhfWM,10190
14
+ evalscope/backend/opencompass/backend_manager.py,sha256=_eg82FLAVxQ6t5e1OqlyuxZcngqD8rxvI5EijLUh_zI,10294
15
15
  evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
16
16
  evalscope/backend/opencompass/tasks/eval_api.py,sha256=12lrgDpMzZ1XBRboq5TEOovDPCMDwwGCJoRT78Ox_yo,1108
17
17
  evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=EizugDMt-ontWsTOaM61XGLUkx-S9rzdLf2Ssfmw3Yc,5263
18
18
  evalscope/backend/vlm_eval_kit/__init__.py,sha256=xTgHM95lWzh4s0W7zxLwYkgUbPAZfAb0UoGGmyyBXrs,83
19
- evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZQ1uyaHxLgjrrmbXepSCluvXudHlJycibs97Js1gg_o,6125
19
+ evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=k52qTUqkp1kJivKn8bVrKoF8cng4xYTQLUmjnH_CWPM,6080
20
20
  evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=zC40Jw9bIqcGKuWS9oKPAlQdBARc-zY3sJlSiU-u-sI,1625
21
21
  evalscope/benchmarks/__init__.py,sha256=6TKP35wfKf7R_h870fsEtcIlIAgomKOcukNL9M-5I1Y,162
22
- evalscope/benchmarks/benchmark.py,sha256=e7rA8Y_vo6q5BhlUbZGWfZ1-SfJnU2IFRg62pnjQtDk,2157
22
+ evalscope/benchmarks/benchmark.py,sha256=EmwYyFdrAHBGMkSbsMZQOR_62Q0CSKl8zeLlr7xvJdQ,2159
23
23
  evalscope/benchmarks/data_adapter.py,sha256=eVQvOQYQOQbIl8UlvOEUqRThL3FP3aUD6DSlqF1bqO0,10395
24
24
  evalscope/benchmarks/arc/__init__.py,sha256=7k2jFDUCHpEKDdQZ3Bmq59YmImFg9RyIfZQIsGebhE8,314
25
25
  evalscope/benchmarks/arc/ai2_arc.py,sha256=Wim8JsH094og7d0eLCEI0kUwDP_0x7AT117oTRPdiAI,5608
@@ -91,7 +91,7 @@ evalscope/cli/cli.py,sha256=uZ-qC8WBsLd5-Hn94d43sSGg0UC_12RebSD4ToKjypg,844
91
91
  evalscope/cli/start_perf.py,sha256=TL6bMXYl3ln-tfs5uBmzb9x94uxz6f3PBFIt1l7g3VA,994
92
92
  evalscope/cli/start_server.py,sha256=ATGLP2TE0aImJNicpehdzBuFlNb50F7KhyL4A_ZSoGU,3885
93
93
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
94
- evalscope/evaluator/evaluator.py,sha256=sWaJ2zkPFkSNuRAGdfhTqgF8nbtL1y55NQYHeBK8MG0,30715
94
+ evalscope/evaluator/evaluator.py,sha256=gB408byOpu269Psh6MjYC9-a_uv9GvThoT7t07Oqh6w,30712
95
95
  evalscope/evaluator/rating_eval.py,sha256=cJbkyXIuwFUZoe7ZJZM6eUskNd9zlORgndckuon2OQ8,5768
96
96
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
97
97
  evalscope/evaluator/reviewer/auto_reviewer.py,sha256=JycPYti9h1j_8DRcu_rc5U0wkEASHYg-XBqrUUoiO-Q,17054
@@ -108,6 +108,8 @@ evalscope/models/model.py,sha256=ZzzVzZHVzuzdt5F1r-rEBT44ZfW9B7R1spsrV-T8nSw,302
108
108
  evalscope/models/model_adapter.py,sha256=Cgs68ajRwTETEo1eU-OhFiFGuSx4eS1p7-JT3jOpcOk,22740
109
109
  evalscope/models/openai_model.py,sha256=PoQS1FIiWIxp1xBJPV7Bq81LFD9FIT3vAHUvNa22DCc,3452
110
110
  evalscope/models/template.py,sha256=Yk7-QnvjiLD0zchSZcaDSLmpW8onIeFpngSwtUOYVPk,56035
111
+ evalscope/models/api/__init__.py,sha256=0c75K78O1KaV02BqqtEp-hhtSSClXLawb8E0c2iqN_A,105
112
+ evalscope/models/api/openai_api.py,sha256=o-FVJFSvfk5mFJm4egXcKfR5ya1fduo5b-uqTkeRu9A,7871
111
113
  evalscope/models/custom/__init__.py,sha256=K4Ewo7Qrs73-jBuPq4ffxd8hMnttKhic-Zj0amH3wiU,103
112
114
  evalscope/models/custom/custom_model.py,sha256=2ivxfGQs5V5HDnQEhTBi5v8KNBxJDbzPVJdNOGo3iSg,1566
113
115
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -117,7 +119,7 @@ evalscope/perf/custom_api.py,sha256=H2IgM-LMjqXxVhbrtkXuiREb-p14zwMmllgl26a-jgw,
117
119
  evalscope/perf/dashscope_api.py,sha256=_XUF3czkYdPdVgtP7nqzRxROKxlqDjWs4DQnTyocNvM,3410
118
120
  evalscope/perf/dataset_plugin_base.py,sha256=6veUTyZ38W1Iig65vxNV9SfmqrsR8ID_UHgNiUO9Bv4,1814
119
121
  evalscope/perf/how_to_analysis_result.py,sha256=UVd_aYJ_7N5hl_wK9oIZig1vSwfgzodxW7XC6IWqbdg,1044
120
- evalscope/perf/http_client.py,sha256=y5t_LPqrxtYS315iz43-4Wu8wZgkftRgC3UlseSVKR0,34540
122
+ evalscope/perf/http_client.py,sha256=WYHuGY_BCeeh8vHi1fm9zrAndOKpVQp4h21j1kKnM64,34535
121
123
  evalscope/perf/openai_api.py,sha256=XrH6jg8VlO9Wu0vGwZna_bHq65XMAlCfCEyqMjs8w1c,5970
122
124
  evalscope/perf/plugin_registry.py,sha256=D2MG2AXDBScjuKxB4g_Hg026pSRO752dBimonYtaAzM,782
123
125
  evalscope/perf/query_parameters.py,sha256=HfGRZJSzRMVfPezWTvbWhYeprCetGNPX_M_paoDtuOY,1346
@@ -142,6 +144,18 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=KYLK-xtv_3qtgCZiwwP4-rP_ftc_qUmtsl1Tf
142
144
  evalscope/registry/tasks/mmlu.yaml,sha256=504yhHVfi9pvUBk_SGPs-Yx7R2hx_2_-nAFiGIiFGx4,726
143
145
  evalscope/registry/tasks/mmlu_mini.yaml,sha256=wVbosZ5Tm9pwLG5nCphalezXilIjcq5j33nz3MR7_BE,778
144
146
  evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
147
+ evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
148
+ evalscope/third_party/longbench_write/eval.py,sha256=_fwV3f-Yq0qrkuZ6LBXvBiXnM6lpz6sOqd7BfYxEU80,11163
149
+ evalscope/third_party/longbench_write/infer.py,sha256=MB0MdSM1qDx15FyrPSU6BXPbSGnBjxuTWqrcHAgbj9o,8318
150
+ evalscope/third_party/longbench_write/longbench_write.py,sha256=MQzlIzv3sGlNgxgX0FPHtDIuAmgwThfBkMeKNcsR3U8,3926
151
+ evalscope/third_party/longbench_write/utils.py,sha256=l6q9cNZLFVRvG9qYbxFxobuQkcMyteU9Y6NxyMU4tmQ,816
152
+ evalscope/third_party/longbench_write/resources/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
153
+ evalscope/third_party/longbench_write/resources/judge.txt,sha256=LEI86IoOtqYUgvQnmXo8A8S8Ef6GEQKJXcrEWSauHVc,1884
154
+ evalscope/third_party/longbench_write/resources/longbench_write.jsonl,sha256=H26ZSXzCTWWJTWXgFAYvOYupRuvdJUt_izOeSNOrV3k,54155
155
+ evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl,sha256=h4AJJ3YfNA5IiZ5N9dR_tyEa1JNqY0INv6l5ZgQUJZ8,24235
156
+ evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl,sha256=odTr8N8PoWAFZ2kdEcmlLeMDfEo3KXDtLo9S8oieCmI,5718
157
+ evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
158
+ evalscope/third_party/longbench_write/tools/data_etl.py,sha256=fSc4iT7_bdTvW20TbjlWme-k1pLqj_e2wXV8z831_Yw,5963
145
159
  evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
146
160
  evalscope/third_party/toolbench_static/eval.py,sha256=TqjMuuYePnD3bGRhQe1_9bIOlAW41kiFSztaEuppRLM,8237
147
161
  evalscope/third_party/toolbench_static/infer.py,sha256=WogwVXqDabdcsJ4uftZxAwR2wncp6HYpkS-fACEvjT4,9331
@@ -159,8 +173,8 @@ evalscope/utils/logger.py,sha256=Ycd0W17Z_oiByPuPX3_umNrOCHjT9O_e_Kws7ZWUSvU,185
159
173
  evalscope/utils/task_cfg_parser.py,sha256=LiNQ2X8lbZU0cODpaY_PbKyUhNoxZIC495UsLJigX64,138
160
174
  evalscope/utils/task_utils.py,sha256=Mv_u_f4Z91zcUeko6acZCmnOAPRfk61kf_dliLzG5Yk,459
161
175
  evalscope/utils/utils.py,sha256=zHo9hfxGBUVKE2xNMR7lDoEvfRnk4V4946DEfXQhlq4,20509
162
- evalscope-0.5.3.dist-info/METADATA,sha256=19GatH8y-jNjQbVX-IGuRb1g2VTjPBOs2dh9RVqrCCQ,21835
163
- evalscope-0.5.3.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
164
- evalscope-0.5.3.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
165
- evalscope-0.5.3.dist-info/top_level.txt,sha256=jNR-HMn3TR8Atolq7_4rW8IWVX6GhvYV5_1Y_KbJKlY,10
166
- evalscope-0.5.3.dist-info/RECORD,,
176
+ evalscope-0.5.4.dist-info/METADATA,sha256=WRfHi_dmJzaF5WXqB2vPz_HhciWfNZ8Su5j_dxbiKQE,20705
177
+ evalscope-0.5.4.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
178
+ evalscope-0.5.4.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
179
+ evalscope-0.5.4.dist-info/top_level.txt,sha256=jNR-HMn3TR8Atolq7_4rW8IWVX6GhvYV5_1Y_KbJKlY,10
180
+ evalscope-0.5.4.dist-info/RECORD,,